Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mar open data #1047

Merged
merged 6 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions _shared_utils/shared_utils/rt_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
"dec2023": "2023-12-13",
"jan2024": "2024-01-17",
"feb2024": "2024-02-14",
"mar2024": "2024-03-13",
}

y2023_dates = [
Expand Down
24 changes: 10 additions & 14 deletions gtfs_digest/merge_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,16 @@ def concatenate_schedule_by_route_direction(
Concatenate schedule data that's been
aggregated to route-direction-time_period.
"""
df = time_series_utils.concatenate_datasets_across_months(
df = time_series_utils.concatenate_datasets_across_dates(
RT_SCHED_GCS,
"schedule_route_dir/schedule_route_direction_metrics",
date_list,
data_type = "df",
columns = route_time_cols + [
"avg_sched_service_min",
"avg_scheduled_service_minutes",
"avg_stop_meters",
"n_trips", "frequency",]
).sort_values(sort_cols).reset_index(drop=True).rename(
columns = {"n_trips": "n_scheduled_trips"}
)
"n_scheduled_trips", "frequency"],
).sort_values(sort_cols).reset_index(drop=True)

return df

Expand All @@ -40,14 +38,14 @@ def concatenate_segment_speeds_by_route_direction(
Concatenate segment speeds data that's been
aggregated to route-direction-time_period.
"""
df = time_series_utils.concatenate_datasets_across_months(
df = time_series_utils.concatenate_datasets_across_dates(
SEGMENT_GCS,
"rollup_singleday/speeds_route_dir_segments",
date_list,
data_type = "gdf",
columns = route_time_cols + [
"stop_pair", "p20_mph", "p50_mph",
"p80_mph", "geometry"]
"p80_mph", "geometry"],
).sort_values(sort_cols).reset_index(drop=True)

return df
Expand All @@ -60,12 +58,12 @@ def concatenate_speeds_by_route_direction(
Concatenate rt vs schedule data that's been
aggregated to route-direction-time_period.
"""
df = time_series_utils.concatenate_datasets_across_months(
df = time_series_utils.concatenate_datasets_across_dates(
SEGMENT_GCS,
"rollup_singleday/speeds_route_dir",
date_list,
data_type = "df",
columns = route_time_cols + ["speed_mph"]
columns = route_time_cols + ["speed_mph"],
).sort_values(sort_cols).reset_index(drop=True)

return df
Expand All @@ -75,14 +73,12 @@ def concatenate_rt_vs_schedule_by_route_direction(
date_list: list
) -> pd.DataFrame:

df = time_series_utils.concatenate_datasets_across_months(
df = time_series_utils.concatenate_datasets_across_dates(
RT_SCHED_GCS,
"vp_route_dir/route_direction_metrics",
date_list,
data_type = "df",
).sort_values(sort_cols).reset_index(drop=True).rename(
columns = {"n_trips": "vp_trips"}
)
).sort_values(sort_cols).reset_index(drop=True)

# We'll add this back in after merging
# because these would be NaN if it's not in schedule
Expand Down
9 changes: 5 additions & 4 deletions gtfs_funnel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@ download_gtfs_data:
python download_stop_times.py
python download_vehicle_positions.py
python concatenate_vehicle_positions.py



preprocess:
python stop_times_with_direction.py
python route_typologies.py
python crosswalk_gtfs_dataset_key_to_organization.py
python vp_keep_usable.py
python vp_direction.py
python vp_condenser.py
python cleanup.py
python route_typologies.py
python crosswalk_gtfs_dataset_key_to_organization.py
python vp_condenser.py

# Start compiling scripts needed when we start concatenating several days
# Clean route names for displaying across time
Expand Down
32 changes: 32 additions & 0 deletions gtfs_funnel/logs/download_data.log
Original file line number Diff line number Diff line change
Expand Up @@ -286,3 +286,35 @@
2024-02-15 09:23:46.825 | INFO | __main__:download_one_day:29 - # operators to run: 169
2024-02-15 09:23:46.826 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
2024-02-15 09:25:01.209 | INFO | __main__:download_one_day:56 - execution time: 0:01:15.946993
2024-03-14 09:04:12.795 | INFO | __main__:download_one_day:46 - Analysis date: 2023-03-13
2024-03-14 09:04:15.112 | INFO | __main__:download_one_day:53 - # operators to run: 197
2024-03-14 09:04:15.112 | INFO | __main__:download_one_day:56 - *********** Download trips data ***********
2024-03-14 09:04:39.494 | INFO | __main__:download_one_day:86 - execution time: 0:00:26.673402
2024-03-14 09:04:57.009 | INFO | __main__:download_one_day:23 - Analysis date: 2023-03-13
2024-03-14 09:04:58.829 | INFO | __main__:download_one_day:30 - # operators to run: 197
2024-03-14 09:04:58.830 | INFO | __main__:download_one_day:33 - *********** Download stops data ***********
2024-03-14 09:05:06.710 | INFO | __main__:download_one_day:64 - execution time: 0:00:09.700498
2024-03-14 09:05:23.556 | INFO | __main__:download_one_day:22 - Analysis date: 2023-03-13
2024-03-14 09:05:25.592 | INFO | __main__:download_one_day:29 - # operators to run: 197
2024-03-14 09:05:25.592 | INFO | __main__:download_one_day:33 - *********** Download routelines data ***********
2024-03-14 09:07:56.533 | INFO | __main__:download_one_day:63 - execution time: 0:02:32.976430
2024-03-14 09:08:13.702 | INFO | __main__:download_one_day:21 - Analysis date: 2023-03-13
2024-03-14 09:08:15.097 | INFO | __main__:download_one_day:29 - # operators to run: 155
2024-03-14 09:08:15.097 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
2024-03-14 09:09:38.948 | INFO | __main__:download_one_day:56 - execution time: 0:01:25.245238
2024-03-14 11:40:02.601 | INFO | __main__:download_one_day:46 - Analysis date: 2024-03-13
2024-03-14 11:40:04.888 | INFO | __main__:download_one_day:53 - # operators to run: 198
2024-03-14 11:40:04.888 | INFO | __main__:download_one_day:56 - *********** Download trips data ***********
2024-03-14 11:40:27.756 | INFO | __main__:download_one_day:86 - execution time: 0:00:25.154163
2024-03-14 11:40:45.783 | INFO | __main__:download_one_day:23 - Analysis date: 2024-03-13
2024-03-14 11:40:47.912 | INFO | __main__:download_one_day:30 - # operators to run: 198
2024-03-14 11:40:47.913 | INFO | __main__:download_one_day:33 - *********** Download stops data ***********
2024-03-14 11:40:55.873 | INFO | __main__:download_one_day:64 - execution time: 0:00:10.088939
2024-03-14 11:41:13.626 | INFO | __main__:download_one_day:22 - Analysis date: 2024-03-13
2024-03-14 11:41:15.717 | INFO | __main__:download_one_day:29 - # operators to run: 198
2024-03-14 11:41:15.718 | INFO | __main__:download_one_day:33 - *********** Download routelines data ***********
2024-03-14 11:42:47.690 | INFO | __main__:download_one_day:63 - execution time: 0:01:34.063605
2024-03-14 11:43:04.973 | INFO | __main__:download_one_day:21 - Analysis date: 2024-03-13
2024-03-14 11:43:06.291 | INFO | __main__:download_one_day:29 - # operators to run: 172
2024-03-14 11:43:06.291 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
2024-03-14 11:44:27.599 | INFO | __main__:download_one_day:56 - execution time: 0:01:22.625555
22 changes: 22 additions & 0 deletions gtfs_funnel/logs/download_vp_v2.log
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,25 @@
2024-02-15 09:34:43.337 | INFO | __main__:<module>:110 - export concatenated vp: 0:02:11.652166
2024-02-15 09:37:09.512 | INFO | __main__:<module>:132 - remove batched parquets
2024-02-15 09:37:09.513 | INFO | __main__:<module>:135 - execution time: 0:04:43.398413
2024-03-14 09:09:58.720 | INFO | __main__:<module>:148 - Analysis date: 2023-03-13
2024-03-14 09:11:36.786 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:01:38.029332
2024-03-14 09:12:26.968 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:00:50.181553
2024-03-14 09:15:04.540 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:02:37.570466
2024-03-14 09:16:39.753 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:01:35.211943
2024-03-14 09:16:39.755 | INFO | __main__:<module>:155 - execution time: 0:06:40.997879
2024-03-14 09:16:57.331 | INFO | __main__:<module>:95 - Analysis date: 2023-03-13
2024-03-14 09:17:03.307 | INFO | __main__:<module>:103 - concat and filter batched data: 0:00:05.975527
2024-03-14 09:19:12.804 | INFO | __main__:<module>:110 - export concatenated vp: 0:02:09.497151
2024-03-14 09:21:46.029 | INFO | __main__:<module>:132 - remove batched parquets
2024-03-14 09:21:46.029 | INFO | __main__:<module>:135 - execution time: 0:04:48.697741
2024-03-14 11:44:47.535 | INFO | __main__:<module>:148 - Analysis date: 2024-03-13
2024-03-14 11:47:05.554 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:02:18.016698
2024-03-14 11:48:07.120 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:01:01.565485
2024-03-14 11:52:52.284 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:04:45.163361
2024-03-14 11:55:08.855 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:02:16.569896
2024-03-14 11:55:08.856 | INFO | __main__:<module>:155 - execution time: 0:10:21.318927
2024-03-14 11:55:28.243 | INFO | __main__:<module>:95 - Analysis date: 2024-03-13
2024-03-14 11:55:35.238 | INFO | __main__:<module>:103 - concat and filter batched data: 0:00:06.994611
2024-03-14 11:58:41.151 | INFO | __main__:<module>:110 - export concatenated vp: 0:03:05.913001
2024-03-14 12:01:43.033 | INFO | __main__:<module>:132 - remove batched parquets
2024-03-14 12:01:43.035 | INFO | __main__:<module>:135 - execution time: 0:06:14.791580
8 changes: 8 additions & 0 deletions gtfs_funnel/logs/vp_preprocessing.log
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,11 @@
2024-02-15 12:43:43.624 | INFO | __main__:<module>:202 - 2024-02-14: vp_direction script execution time: 0:06:24.980603
2024-02-15 12:50:35.377 | INFO | __main__:<module>:142 - 2024-02-14: condense vp for trip-direction 0:06:37.853370
2024-02-15 13:02:43.454 | INFO | __main__:<module>:150 - 2024-02-14: prepare vp to use in nearest neighbor: 0:12:08.077021
2024-03-14 12:08:15.749 | INFO | __main__:<module>:169 - 2024-03-13: pare down vp: 0:01:39.120888
2024-03-14 12:11:52.801 | INFO | __main__:attach_prior_vp_add_direction:89 - persist vp gddf: 0:03:19.615961
2024-03-14 12:14:59.645 | INFO | __main__:attach_prior_vp_add_direction:121 - np vectorize arrays for direction: 0:03:06.843928
2024-03-14 12:15:05.566 | INFO | __main__:<module>:193 - 2024-03-13: export vp direction: 0:06:32.381100
2024-03-14 12:16:08.741 | INFO | __main__:<module>:199 - 2024-03-13: export usable vp with direction: 0:01:03.175027
2024-03-14 12:16:08.742 | INFO | __main__:<module>:202 - 2024-03-13: vp_direction script execution time: 0:07:35.556127
2024-03-14 12:43:58.062 | INFO | __main__:<module>:153 - 2024-03-13: condense vp for trip 0:04:45.267623
2024-03-14 12:56:43.421 | INFO | __main__:<module>:161 - 2024-03-13: prepare vp to use in nearest neighbor: 0:12:45.358549
9 changes: 4 additions & 5 deletions gtfs_funnel/route_typologies.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ def assemble_scheduled_trip_metrics(

time_of_day = (gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date)
[["trip_instance_key", "time_of_day",
"service_minutes"]]
.rename(columns = {"service_minutes": "sched_service_min"})
"scheduled_service_minutes"]]
)

trip_cols = ["schedule_gtfs_dataset_key", "trip_instance_key"]
Expand Down Expand Up @@ -81,15 +80,15 @@ def schedule_metrics_by_route_direction(
# take mean of the median stop spacing for trip
# does this make sense?
# median is the single boiled down metric at the trip-level
"sched_service_min": "mean",
"scheduled_service_minutes": "mean",
}).reset_index()
.rename(columns = {
"median_stop_meters": "avg_stop_meters",
"sched_service_min": "avg_sched_service_min"
"scheduled_service_minutes": "avg_scheduled_service_minutes"
})
)

round_me = ["avg_stop_meters", "avg_sched_service_min"]
round_me = ["avg_stop_meters", "avg_scheduled_service_minutes"]
metrics_df[round_me] = metrics_df[round_me].round(2)

common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
Expand Down
2 changes: 1 addition & 1 deletion gtfs_funnel/update_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
rt_dates.oct_week + rt_dates.apr_week)

analysis_date_list = [
rt_dates.DATES["feb2024"]
rt_dates.DATES["mar2024"]
]

CONFIG_PATH = Path("config.yml")
Expand Down
42 changes: 12 additions & 30 deletions gtfs_funnel/vp_condenser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""
Condense vp into arrays by trip-direction.
"""
import dask.dataframe as dd
import datetime
import geopandas as gpd
import pandas as pd
Expand All @@ -28,50 +27,32 @@ def condense_vp_to_linestring(
USABLE_VP = dict_inputs["usable_vp_file"]
EXPORT_FILE = dict_inputs["vp_condensed_line_file"]

vp = dd.read_parquet(
vp = delayed(pd.read_parquet)(
f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}",
columns = ["trip_instance_key", "x", "y",
"vp_idx", "vp_primary_direction",
"location_timestamp_local"
],
)

vp_dtypes = vp.drop(columns = ["x", "y"]).dtypes.to_dict()

vp_gdf = vp.map_partitions(
wrangle_shapes.vp_as_gdf,
crs = WGS84,
meta = {
**vp_dtypes,
"geometry": "geometry"
},
align_dataframes = True
)

vp_condensed = vp_gdf.map_partitions(
vp_transform.condense_point_geom_to_line,
vp_gdf = delayed(wrangle_shapes.vp_as_gdf)(vp, crs = WGS84)

vp_condensed = delayed(vp_transform.condense_point_geom_to_line)(
vp_gdf,
group_cols = ["trip_instance_key"],
geom_col = "geometry",
other_cols = ["vp_idx", "location_timestamp_local",
"vp_primary_direction"],
meta = {
"trip_instance_key": "object",
"geometry": "geometry",
"vp_idx": "object",
"location_timestamp_local": "object",
"vp_primary_direction": "object",
},
align_dataframes = False
).compute().set_geometry("geometry").set_crs(WGS84)
).set_geometry("geometry").set_crs(WGS84)

vp_condensed = compute(vp_condensed)[0]

utils.geoparquet_gcs_export(
vp_condensed,
SEGMENT_GCS,
f"{EXPORT_FILE}_{analysis_date}"
)

del vp_condensed


return


Expand Down Expand Up @@ -100,7 +81,7 @@ def prepare_vp_for_all_directions(
vp, direction)
for direction in wrangle_shapes.ALL_DIRECTIONS
]

results = [compute(i)[0] for i in dfs]

gdf = pd.concat(
Expand Down Expand Up @@ -135,10 +116,11 @@ def prepare_vp_for_all_directions(

for analysis_date in analysis_date_list:
start = datetime.datetime.now()

condense_vp_to_linestring(analysis_date, CONFIG_DICT)

time1 = datetime.datetime.now()

logger.info(
f"{analysis_date}: condense vp for trip "
f"{time1 - start}"
Expand Down
4 changes: 2 additions & 2 deletions gtfs_funnel/vp_direction.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ def add_direction_to_usable_vp(
).drop_duplicates(subset=["vp_idx", "vp_primary_direction"])

export_path = f"{SEGMENT_GCS}{INPUT_FILE}_{analysis_date}"
if fs.exists(export_path):
fs.rm(export_path, recursive=True)

helpers.if_exists_then_delete(export_path)

vp_with_dir.to_parquet(
export_path,
Expand Down
6 changes: 6 additions & 0 deletions high_quality_transit_areas/B1_create_hqta_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,14 @@ def select_shapes_and_segment(
Concatenate these 2 portions and then cut HQTA segments.
Returns the hqta_segments for all the routes across all operators.
"""
# Exclude Amtrak shapes that run outside CA
outside_amtrak_shapes = gtfs_schedule_wrangling.amtrak_trips(
analysis_date, inside_ca = False).shape_array_key.unique()

gdf = gtfs_schedule_wrangling.longest_shape_by_route_direction(
analysis_date
).query(
'shape_array_key not in @outside_amtrak_shapes'
).drop(
columns = ["schedule_gtfs_dataset_key",
"shape_array_key", "route_length"]
Expand Down
Loading
Loading