Merge pull request #827 from cal-itp/clean-up-segment-speed-utils
Quarterly metrics / clean up segment_speed_utils and bus_service_utils
tiffanychu90 authored Aug 4, 2023
2 parents 0419179 + 88f67c6 commit ef8d7c4
Showing 29 changed files with 846 additions and 1,199 deletions.
1 change: 1 addition & 0 deletions _shared_utils/shared_utils/rt_dates.py
@@ -34,4 +34,5 @@
"Q4_2022": "2022-10-12",
"Q1_2023": "2023-01-18",
"Q2_2023": "2023-04-12",
"Q3_2023": "2023-07-12",
}
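The new `Q3_2023` entry extends the quarterly date lookup used across analyses. As a hedged sketch of how downstream code reads it (assuming the dict in `rt_dates.py` is named `DATES`; the diff above shows only its entries):

```python
# Sketch: pull the Q3 2023 analysis date from the shared lookup.
# Assumes the dict defined in rt_dates.py is named DATES.
from shared_utils import rt_dates

analysis_date = rt_dates.DATES["Q3_2023"]  # "2023-07-12"
```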
@@ -235,7 +235,7 @@ def generate_calenviroscreen_lehd_data(
    # Merge together
    final = merge_calenviroscreen_lehd(gdf, lehd)

-    if GCS is True:
+    if GCS:
        shared_utils.utils.geoparquet_gcs_export(
            final,
            utils.GCS_FILE_PATH,
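`if GCS:` is the idiomatic replacement for `if GCS is True:`. For a boolean flag the two behave identically; they diverge only if a caller passes a truthy non-boolean, which the truthiness check still accepts. A minimal illustration:

```python
def export(GCS: bool = True) -> str:
    # Truthiness check: True, 1, or any truthy value takes this branch;
    # the stricter `if GCS is True:` only matches the literal True.
    if GCS:
        return "write to GCS"
    return "write locally"

assert export(True) == "write to GCS"
assert export(1) == "write to GCS"      # would fail the `is True` version
assert export(False) == "write locally"
```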
@@ -144,7 +144,7 @@ def overlay_transit_to_highways(
        transit_routes,
        highways,
        how = "intersection",
-        keep_geom_type = False
+        keep_geom_type = True
    )

    # Using new geometry column, calculate what % that intersection
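Flipping `keep_geom_type` to `True` matters here: `geopandas.overlay` with `how="intersection"` can emit mixed geometry types (stray points where a route merely touches a highway polygon), and `keep_geom_type=True` keeps only geometries matching the first input's type. A toy sketch of the call pattern (made-up data, not the project's inputs):

```python
import geopandas as gpd
from shapely.geometry import LineString, Polygon

# Toy stand-ins for transit_routes (lines) and highways (polygons).
transit_routes = gpd.GeoDataFrame(
    {"route_id": ["r1"]},
    geometry=[LineString([(0, 0), (2, 2)])],
    crs="EPSG:3310",
)
highways = gpd.GeoDataFrame(
    {"hwy": ["I-80"]},
    geometry=[Polygon([(1, 0), (3, 0), (3, 3), (1, 3)])],
    crs="EPSG:3310",
)

# keep_geom_type=True drops any non-line pieces from the intersection,
# so the result stays line-typed like transit_routes.
intersected = gpd.overlay(
    transit_routes,
    highways,
    how="intersection",
    keep_geom_type=True,
)
```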
82 changes: 0 additions & 82 deletions bus_service_increase/bus_service_utils/utils.py
@@ -6,9 +6,6 @@
import pandas as pd

from calendar import THURSDAY, SATURDAY, SUNDAY
-from calitp_data.storage import get_fs
-
-fs = get_fs()

GCS_PROJECT = "cal-itp-data-infra"
BUCKET_NAME = "calitp-analytics-data"
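The module-level `get_fs()` handle can go because pandas resolves `gs://` paths itself through fsspec/gcsfs, which is also why the thin CSV-to-parquet wrappers deleted below could be inlined at their call sites. A minimal sketch of the direct pattern (assuming gcsfs is installed; the bucket path echoes the deleted docstring's example):

```python
import pandas as pd

# pandas routes gs:// URLs through fsspec/gcsfs; no explicit
# filesystem handle is required.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/my-folder/"

df = pd.read_csv("my_dataset.csv")
df.to_parquet(f"{GCS_FILE_PATH}my_dataset.parquet")
```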
@@ -17,34 +14,6 @@

DATA_PATH = "./data/"
IMG_PATH = "./img/"


-def import_csv_export_parquet(dataset_name: str, output_file_name: str,
-                              GCS_FILE_PATH: str, GCS: bool = True):
-    """
-    DATASET_NAME: str. Name of csv dataset.
-    OUTPUT_FILE_NAME: str. Name of output parquet dataset.
-    GCS_FILE_PATH: str. Ex: gs://calitp-analytics-data/data-analyses/my-folder/
-    """
-    df = pd.read_csv(f"{dataset_name}.csv")
-
-    if GCS is True:
-        df.to_parquet(f"{GCS_FILE_PATH}{output_file_name}.parquet")
-    else:
-        df.to_parquet(f"./{output_file_name}.parquet")
-
-
-def import_export(DATASET_NAME: str, OUTPUT_FILE_NAME: str, GCS:bool=True):
-    """
-    DATASET_NAME: str. Name of csv dataset.
-    OUTPUT_FILE_NAME: str. Name of output parquet dataset.
-    """
-    df = pd.read_csv(f"{DATASET_NAME}.csv")
-
-    if GCS is True:
-        df.to_parquet(f"{GCS_FILE_PATH}{OUTPUT_FILE_NAME}.parquet")
-    else:
-        df.to_parquet(f"./{OUTPUT_FILE_NAME}.parquet")

def get_recent_dates() -> dict:
@@ -63,57 +32,6 @@ def get_recent_dates() -> dict:
    return dict(zip(['thurs', 'sat', 'sun'], dates))


-# There are multiple feeds, with different trip_keys but same trip_ids
-# Only keep calitp_url_number == 0 EXCEPT LA Metro
-def include_exclude_multiple_feeds(df: pd.DataFrame,
-                                   id_col: str = "itp_id",
-                                   include_ids: list = [182],
-                                   exclude_ids: list = [200]) -> pd.DataFrame:
-    """
-    df: pandas.DataFrame.
-    id_col: str, column name for calitp_itp_id, such as "itp_id"
-    include_ids: list,
-        list of itp_ids that are allowed to have multiple feeds
-        (Ex: LA Metro)
-    exclude_ids: list, list of itp_ids to drop. (Ex: MTC, regional feed)
-    """
-    # If there are explicit regional feeds to drop, put that in exclude_ids
-    group_cols = list(df.columns)
-    dup_cols = [i for i in group_cols if i != "calitp_url_number"]
-
-    df2 = (df[~df[id_col].isin(exclude_ids)]
-           .sort_values(group_cols)
-           .drop_duplicates(subset=dup_cols)
-           .reset_index(drop=True)
-          )
-
-    print(f"# obs in original df: {len(df)}")
-    print(f"# obs in new df: {len(df2)}")
-
-    # There are still multiple operators here
-    # But, seems like for those trip_ids, they are different values
-    # between url_number==0 vs url_number==1
-    multiple_urls = list(df2[df2.calitp_url_number==1][id_col].unique())
-    print(f"These operators have multiple calitp_url_number values: {multiple_urls}")
-
-    return df2
-
-
-def fix_gtfs_time(gtfs_timestring: str) -> str:
-    '''Reformats a GTFS timestamp (which allows the hour to exceed 24 to
-    mark service day continuity)
-    to standard 24-hour time.
-    '''
-    split = gtfs_timestring.split(':')
-    hour = int(split[0])
-    if hour >= 24:
-        split[0] = str(hour - 24)
-        corrected = (':').join(split)
-        return corrected.strip()
-    else:
-        return gtfs_timestring.strip()


# https://stackoverflow.com/questions/25052980/use-pickle-to-save-dictionary-in-python
def save_request_json(my_list: list,
                      name: str,
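Among the deletions, `fix_gtfs_time` encoded a GTFS subtlety worth recording: stop times may exceed 24:00:00 to mark trips that continue past midnight on the same service day. Its behavior, restated with worked examples:

```python
def fix_gtfs_time(gtfs_timestring: str) -> str:
    """Reformat a GTFS timestamp (hour may exceed 24 to mark
    service-day continuity) to standard 24-hour time."""
    split = gtfs_timestring.split(":")
    hour = int(split[0])
    if hour >= 24:
        split[0] = str(hour - 24)
        return ":".join(split).strip()
    return gtfs_timestring.strip()

assert fix_gtfs_time("25:15:00") == "1:15:00"   # 1:15 AM, next calendar day
assert fix_gtfs_time("08:30:00") == "08:30:00"  # already standard time
```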
2 changes: 1 addition & 1 deletion bus_service_increase/setup.py
@@ -3,7 +3,7 @@
setup(
    name="bus_service_utils",
    packages=find_packages(),
-    version="1.0",
+    version="2.0",
    description="Shared utility functions for bus service data analyses",
    author="Cal-ITP",
    license="Apache",
10 changes: 6 additions & 4 deletions open_data/create_routes_data.py
@@ -12,7 +12,7 @@

import prep_traffic_ops
from shared_utils import utils, geography_utils, portfolio_utils
-from segment_speed_utils import helpers, gtfs_schedule_wrangling
+from segment_speed_utils import helpers
from update_vars import analysis_date, TRAFFIC_OPS_GCS


@@ -32,12 +32,13 @@ def create_routes_file_for_export(date: str) -> gpd.GeoDataFrame:
        crs = geography_utils.WGS84
    ).dropna(subset="shape_array_key")

-    df = gtfs_schedule_wrangling.merge_shapes_to_trips(
+    df = pd.merge(
        shapes,
        trips,
-        merge_cols = ["shape_array_key"]
+        on = "shape_array_key",
+        how = "inner"
    ).drop(columns = "trip_id").drop_duplicates(subset="shape_array_key")

    df2 = remove_erroneous_shapes(df)

    drop_cols = ["route_short_name", "route_long_name", "route_desc"]
@@ -83,6 +84,7 @@ def remove_erroneous_shapes(

    return ok_shapes

+
def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
"""
Suppress certain columns used in our internal modeling for export.
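Swapping the `gtfs_schedule_wrangling.merge_shapes_to_trips` wrapper for a plain `pd.merge` makes the join explicit: an inner merge on `shape_array_key`, after which `drop_duplicates` collapses the many trips sharing a shape back to one row per shape. A toy illustration of the pattern (column values invented):

```python
import pandas as pd

shapes = pd.DataFrame({
    "shape_array_key": ["a", "b"],
    "n_points": [120, 85],
})
trips = pd.DataFrame({
    "shape_array_key": ["a", "a", "b"],  # several trips share one shape
    "trip_id": ["t1", "t2", "t3"],
})

df = (
    pd.merge(shapes, trips, on="shape_array_key", how="inner")
    .drop(columns="trip_id")
    .drop_duplicates(subset="shape_array_key")
)
print(len(df))  # 2 -- back to one row per shape
```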
8 changes: 5 additions & 3 deletions open_data/create_stops_data.py
@@ -13,7 +13,7 @@

import prep_traffic_ops
from shared_utils import utils, geography_utils, schedule_rt_utils
-from segment_speed_utils import helpers, gtfs_schedule_wrangling
+from segment_speed_utils import helpers
from update_vars import analysis_date, TRAFFIC_OPS_GCS


@@ -44,9 +44,11 @@ def attach_route_info_to_stops(
        .reset_index(drop=True)
    ).compute()

-    stops_with_geom = gtfs_schedule_wrangling.attach_stop_geometry(
+    stops_with_geom = dd.merge(
+        stops,
        stops_with_route_info,
-        stops
+        on = ["feed_key", "stop_id"],
+        how = "inner"
    )

    stops_assembled = (stops_with_geom
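`dd.merge` mirrors the `pd.merge` API for dask dataframes and accepts a plain pandas frame on either side, so the explicit join reads the same as the pandas version above. A minimal sketch with toy frames (keys follow the diff; values are invented):

```python
import dask.dataframe as dd
import pandas as pd

stops = dd.from_pandas(
    pd.DataFrame({
        "feed_key": ["f1", "f1"],
        "stop_id": ["s1", "s2"],
    }),
    npartitions=1,
)
stops_with_route_info = pd.DataFrame({
    "feed_key": ["f1"],
    "stop_id": ["s1"],
    "route_id": ["r1"],
})

# dd.merge mixes dask and pandas inputs; the join stays lazy
# until .compute() is called.
stops_with_geom = dd.merge(
    stops,
    stops_with_route_info,
    on=["feed_key", "stop_id"],
    how="inner",
)
print(stops_with_geom.compute())  # one row: (f1, s1, r1)
```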
2 changes: 1 addition & 1 deletion open_data/download_trips.py
@@ -23,7 +23,7 @@ def get_operators(analysis_date: str):
        keep_cols = None,
        get_df = True,
        feed_option = "use_subfeeds"
-    ).rename(columns = {"_gtfs_dataset_name": "name"})
+    ).rename(columns = {"gtfs_dataset_name": "name"})

keep_cols = ["feed_key", "name"]

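The rename tracks an upstream schema change: the dataset-name column is now emitted without the leading underscore. If code must tolerate extracts from both before and after the change, one option is a rename map covering both spellings, since `DataFrame.rename` silently ignores absent keys. A hypothetical sketch (this helper is not part of the PR):

```python
import pandas as pd

def standardize_name_col(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical helper: map either spelling of the column to "name";
    # rename() ignores keys that are not present in the frame.
    return df.rename(columns={
        "_gtfs_dataset_name": "name",
        "gtfs_dataset_name": "name",
    })
```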
Git LFS file not shown
Git LFS file not shown
15 changes: 8 additions & 7 deletions portfolio/quarterly_performance_metrics/README.md
@@ -14,18 +14,19 @@ Initially presented for the Planning and Modal Advisory Committee (PMAC).
## Workflow
### Data Generation

-1. [Generate processed data for categories and service hours](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A1_generate_routes_on_shn_data.py) with GTFS schedule data
-1. [Categorize routes into 3 groups](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A2_categorize_routes.py)
-1. [Generate endpoint data processed data](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A3_generate_endpoint_delay.py) with GTFS real-time data
-1. [Merge service hours and endpoint delay](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A4_route_service_hours_delay.py)
+1. [Aggregate from shape level to route level](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A1_scheduled_route_level_df)
+1. [Generate processed data for categories and service hours](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A2_generate_routes_on_shn_data.py) with GTFS schedule data
+1. [Categorize routes into 3 groups](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A3_categorize_routes.py)
+1. [Generate processed endpoint delay data](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/B1_generate_endpoint_delay.py) with GTFS real-time data
+1. [Merge service hours and endpoint delay](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/B2_route_service_hours_delay.py)

### Helper Scripts for Reports
-1. [data prep functions](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/B1_report_metrics.py)
-1. [chart functions](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/B2_report_charts.py)
+1. [data prep functions](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/C1_report_metrics.py)
+1. [chart functions](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/C2_report_charts.py)

### Reports

Create a report of the current quarter's snapshot as well as a historical comparison of quarterly metrics.

1. [current quarter's snapshot](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/current_quarter_report.ipynb)
-1. [historical comparison](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/historical_report.ipynb)
+1. [historical comparison](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/historical_service_hours_v2.ipynb)
2 changes: 1 addition & 1 deletion portfolio/quarterly_performance_metrics/_config.yml
@@ -35,7 +35,7 @@ html:
  use_issues_button: true
  use_repository_button: true
  use_edit_page_button: true
-  google_analytics_id: G-SZB618VNBZ
+  google_analytics_id: 'G-JCX3Z8JZJC'

sphinx:
  config: