Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Mar open data #1047

Merged
merged 6 commits into from
Mar 14, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions _shared_utils/shared_utils/rt_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,7 @@
"dec2023": "2023-12-13",
"jan2024": "2024-01-17",
"feb2024": "2024-02-14",
"mar2024": "2024-03-13",
}

y2023_dates = [
Expand Down
24 changes: 10 additions & 14 deletions gtfs_digest/merge_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,18 +17,16 @@ def concatenate_schedule_by_route_direction(
Concatenate schedule data that's been
aggregated to route-direction-time_period.
"""
df = time_series_utils.concatenate_datasets_across_months(
df = time_series_utils.concatenate_datasets_across_dates(
RT_SCHED_GCS,
"schedule_route_dir/schedule_route_direction_metrics",
date_list,
data_type = "df",
columns = route_time_cols + [
"avg_sched_service_min",
"avg_scheduled_service_minutes",
"avg_stop_meters",
"n_trips", "frequency",]
).sort_values(sort_cols).reset_index(drop=True).rename(
columns = {"n_trips": "n_scheduled_trips"}
)
"n_scheduled_trips", "frequency"],
).sort_values(sort_cols).reset_index(drop=True)

return df

Expand All @@ -40,14 +38,14 @@ def concatenate_segment_speeds_by_route_direction(
Concatenate segment speeds data that's been
aggregated to route-direction-time_period.
"""
df = time_series_utils.concatenate_datasets_across_months(
df = time_series_utils.concatenate_datasets_across_dates(
SEGMENT_GCS,
"rollup_singleday/speeds_route_dir_segments",
date_list,
data_type = "gdf",
columns = route_time_cols + [
"stop_pair", "p20_mph", "p50_mph",
"p80_mph", "geometry"]
"p80_mph", "geometry"],
).sort_values(sort_cols).reset_index(drop=True)

return df
Expand All @@ -60,12 +58,12 @@ def concatenate_speeds_by_route_direction(
Concatenate rt vs schedule data that's been
aggregated to route-direction-time_period.
"""
df = time_series_utils.concatenate_datasets_across_months(
df = time_series_utils.concatenate_datasets_across_dates(
SEGMENT_GCS,
"rollup_singleday/speeds_route_dir",
date_list,
data_type = "df",
columns = route_time_cols + ["speed_mph"]
columns = route_time_cols + ["speed_mph"],
).sort_values(sort_cols).reset_index(drop=True)

return df
Expand All @@ -75,14 +73,12 @@ def concatenate_rt_vs_schedule_by_route_direction(
date_list: list
) -> pd.DataFrame:

df = time_series_utils.concatenate_datasets_across_months(
df = time_series_utils.concatenate_datasets_across_dates(
RT_SCHED_GCS,
"vp_route_dir/route_direction_metrics",
date_list,
data_type = "df",
).sort_values(sort_cols).reset_index(drop=True).rename(
columns = {"n_trips": "vp_trips"}
)
).sort_values(sort_cols).reset_index(drop=True)

# We'll add this back in after merging
# because these would be NaN if it's not in schedule
Expand Down
9 changes: 5 additions & 4 deletions gtfs_funnel/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,16 @@ download_gtfs_data:
python download_stop_times.py
python download_vehicle_positions.py
python concatenate_vehicle_positions.py



preprocess:
python stop_times_with_direction.py
python route_typologies.py
python crosswalk_gtfs_dataset_key_to_organization.py
python vp_keep_usable.py
python vp_direction.py
python vp_condenser.py
python cleanup.py
python route_typologies.py
python crosswalk_gtfs_dataset_key_to_organization.py
python vp_condenser.py

# Start compiling scripts needed when we start concatenating several days
# Clean route names for displaying across time
Expand Down
32 changes: 32 additions & 0 deletions gtfs_funnel/logs/download_data.log
Original file line number Diff line number Diff line change
Expand Up @@ -286,3 +286,35 @@
2024-02-15 09:23:46.825 | INFO | __main__:download_one_day:29 - # operators to run: 169
2024-02-15 09:23:46.826 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
2024-02-15 09:25:01.209 | INFO | __main__:download_one_day:56 - execution time: 0:01:15.946993
2024-03-14 09:04:12.795 | INFO | __main__:download_one_day:46 - Analysis date: 2023-03-13
2024-03-14 09:04:15.112 | INFO | __main__:download_one_day:53 - # operators to run: 197
2024-03-14 09:04:15.112 | INFO | __main__:download_one_day:56 - *********** Download trips data ***********
2024-03-14 09:04:39.494 | INFO | __main__:download_one_day:86 - execution time: 0:00:26.673402
2024-03-14 09:04:57.009 | INFO | __main__:download_one_day:23 - Analysis date: 2023-03-13
2024-03-14 09:04:58.829 | INFO | __main__:download_one_day:30 - # operators to run: 197
2024-03-14 09:04:58.830 | INFO | __main__:download_one_day:33 - *********** Download stops data ***********
2024-03-14 09:05:06.710 | INFO | __main__:download_one_day:64 - execution time: 0:00:09.700498
2024-03-14 09:05:23.556 | INFO | __main__:download_one_day:22 - Analysis date: 2023-03-13
2024-03-14 09:05:25.592 | INFO | __main__:download_one_day:29 - # operators to run: 197
2024-03-14 09:05:25.592 | INFO | __main__:download_one_day:33 - *********** Download routelines data ***********
2024-03-14 09:07:56.533 | INFO | __main__:download_one_day:63 - execution time: 0:02:32.976430
2024-03-14 09:08:13.702 | INFO | __main__:download_one_day:21 - Analysis date: 2023-03-13
2024-03-14 09:08:15.097 | INFO | __main__:download_one_day:29 - # operators to run: 155
2024-03-14 09:08:15.097 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
2024-03-14 09:09:38.948 | INFO | __main__:download_one_day:56 - execution time: 0:01:25.245238
2024-03-14 11:40:02.601 | INFO | __main__:download_one_day:46 - Analysis date: 2024-03-13
2024-03-14 11:40:04.888 | INFO | __main__:download_one_day:53 - # operators to run: 198
2024-03-14 11:40:04.888 | INFO | __main__:download_one_day:56 - *********** Download trips data ***********
2024-03-14 11:40:27.756 | INFO | __main__:download_one_day:86 - execution time: 0:00:25.154163
2024-03-14 11:40:45.783 | INFO | __main__:download_one_day:23 - Analysis date: 2024-03-13
2024-03-14 11:40:47.912 | INFO | __main__:download_one_day:30 - # operators to run: 198
2024-03-14 11:40:47.913 | INFO | __main__:download_one_day:33 - *********** Download stops data ***********
2024-03-14 11:40:55.873 | INFO | __main__:download_one_day:64 - execution time: 0:00:10.088939
2024-03-14 11:41:13.626 | INFO | __main__:download_one_day:22 - Analysis date: 2024-03-13
2024-03-14 11:41:15.717 | INFO | __main__:download_one_day:29 - # operators to run: 198
2024-03-14 11:41:15.718 | INFO | __main__:download_one_day:33 - *********** Download routelines data ***********
2024-03-14 11:42:47.690 | INFO | __main__:download_one_day:63 - execution time: 0:01:34.063605
2024-03-14 11:43:04.973 | INFO | __main__:download_one_day:21 - Analysis date: 2024-03-13
2024-03-14 11:43:06.291 | INFO | __main__:download_one_day:29 - # operators to run: 172
2024-03-14 11:43:06.291 | INFO | __main__:download_one_day:33 - *********** Download st data ***********
2024-03-14 11:44:27.599 | INFO | __main__:download_one_day:56 - execution time: 0:01:22.625555
22 changes: 22 additions & 0 deletions gtfs_funnel/logs/download_vp_v2.log
Original file line number Diff line number Diff line change
Expand Up @@ -185,3 +185,25 @@
2024-02-15 09:34:43.337 | INFO | __main__:<module>:110 - export concatenated vp: 0:02:11.652166
2024-02-15 09:37:09.512 | INFO | __main__:<module>:132 - remove batched parquets
2024-02-15 09:37:09.513 | INFO | __main__:<module>:135 - execution time: 0:04:43.398413
2024-03-14 09:09:58.720 | INFO | __main__:<module>:148 - Analysis date: 2023-03-13
2024-03-14 09:11:36.786 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:01:38.029332
2024-03-14 09:12:26.968 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:00:50.181553
2024-03-14 09:15:04.540 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:02:37.570466
2024-03-14 09:16:39.753 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:01:35.211943
2024-03-14 09:16:39.755 | INFO | __main__:<module>:155 - execution time: 0:06:40.997879
2024-03-14 09:16:57.331 | INFO | __main__:<module>:95 - Analysis date: 2023-03-13
2024-03-14 09:17:03.307 | INFO | __main__:<module>:103 - concat and filter batched data: 0:00:05.975527
2024-03-14 09:19:12.804 | INFO | __main__:<module>:110 - export concatenated vp: 0:02:09.497151
2024-03-14 09:21:46.029 | INFO | __main__:<module>:132 - remove batched parquets
2024-03-14 09:21:46.029 | INFO | __main__:<module>:135 - execution time: 0:04:48.697741
2024-03-14 11:44:47.535 | INFO | __main__:<module>:148 - Analysis date: 2024-03-13
2024-03-14 11:47:05.554 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:02:18.016698
2024-03-14 11:48:07.120 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:01:01.565485
2024-03-14 11:52:52.284 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:04:45.163361
2024-03-14 11:55:08.855 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:02:16.569896
2024-03-14 11:55:08.856 | INFO | __main__:<module>:155 - execution time: 0:10:21.318927
2024-03-14 11:55:28.243 | INFO | __main__:<module>:95 - Analysis date: 2024-03-13
2024-03-14 11:55:35.238 | INFO | __main__:<module>:103 - concat and filter batched data: 0:00:06.994611
2024-03-14 11:58:41.151 | INFO | __main__:<module>:110 - export concatenated vp: 0:03:05.913001
2024-03-14 12:01:43.033 | INFO | __main__:<module>:132 - remove batched parquets
2024-03-14 12:01:43.035 | INFO | __main__:<module>:135 - execution time: 0:06:14.791580
8 changes: 8 additions & 0 deletions gtfs_funnel/logs/vp_preprocessing.log
Original file line number Diff line number Diff line change
Expand Up @@ -12,3 +12,11 @@
2024-02-15 12:43:43.624 | INFO | __main__:<module>:202 - 2024-02-14: vp_direction script execution time: 0:06:24.980603
2024-02-15 12:50:35.377 | INFO | __main__:<module>:142 - 2024-02-14: condense vp for trip-direction 0:06:37.853370
2024-02-15 13:02:43.454 | INFO | __main__:<module>:150 - 2024-02-14: prepare vp to use in nearest neighbor: 0:12:08.077021
2024-03-14 12:08:15.749 | INFO | __main__:<module>:169 - 2024-03-13: pare down vp: 0:01:39.120888
2024-03-14 12:11:52.801 | INFO | __main__:attach_prior_vp_add_direction:89 - persist vp gddf: 0:03:19.615961
2024-03-14 12:14:59.645 | INFO | __main__:attach_prior_vp_add_direction:121 - np vectorize arrays for direction: 0:03:06.843928
2024-03-14 12:15:05.566 | INFO | __main__:<module>:193 - 2024-03-13: export vp direction: 0:06:32.381100
2024-03-14 12:16:08.741 | INFO | __main__:<module>:199 - 2024-03-13: export usable vp with direction: 0:01:03.175027
2024-03-14 12:16:08.742 | INFO | __main__:<module>:202 - 2024-03-13: vp_direction script execution time: 0:07:35.556127
2024-03-14 12:43:58.062 | INFO | __main__:<module>:153 - 2024-03-13: condense vp for trip 0:04:45.267623
2024-03-14 12:56:43.421 | INFO | __main__:<module>:161 - 2024-03-13: prepare vp to use in nearest neighbor: 0:12:45.358549
9 changes: 4 additions & 5 deletions gtfs_funnel/route_typologies.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,7 @@ def assemble_scheduled_trip_metrics(

time_of_day = (gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date)
[["trip_instance_key", "time_of_day",
"service_minutes"]]
.rename(columns = {"service_minutes": "sched_service_min"})
"scheduled_service_minutes"]]
)

trip_cols = ["schedule_gtfs_dataset_key", "trip_instance_key"]
Expand Down Expand Up @@ -81,15 +80,15 @@ def schedule_metrics_by_route_direction(
# take mean of the median stop spacing for trip
# does this make sense?
# median is the single boiled down metric at the trip-level
"sched_service_min": "mean",
"scheduled_service_minutes": "mean",
}).reset_index()
.rename(columns = {
"median_stop_meters": "avg_stop_meters",
"sched_service_min": "avg_sched_service_min"
"scheduled_service_minutes": "avg_scheduled_service_minutes"
})
)

round_me = ["avg_stop_meters", "avg_sched_service_min"]
round_me = ["avg_stop_meters", "avg_scheduled_service_minutes"]
metrics_df[round_me] = metrics_df[round_me].round(2)

common_shape = gtfs_schedule_wrangling.most_common_shape_by_route_direction(
Expand Down
2 changes: 1 addition & 1 deletion gtfs_funnel/update_vars.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
rt_dates.oct_week + rt_dates.apr_week)

analysis_date_list = [
rt_dates.DATES["feb2024"]
rt_dates.DATES["mar2024"]
]

CONFIG_PATH = Path("config.yml")
Expand Down
42 changes: 12 additions & 30 deletions gtfs_funnel/vp_condenser.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
"""
Condense vp into arrays by trip-direction.
"""
import dask.dataframe as dd
import datetime
import geopandas as gpd
import pandas as pd
Expand All @@ -28,50 +27,32 @@ def condense_vp_to_linestring(
USABLE_VP = dict_inputs["usable_vp_file"]
EXPORT_FILE = dict_inputs["vp_condensed_line_file"]

vp = dd.read_parquet(
vp = delayed(pd.read_parquet)(
f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}",
columns = ["trip_instance_key", "x", "y",
"vp_idx", "vp_primary_direction",
"location_timestamp_local"
],
)

vp_dtypes = vp.drop(columns = ["x", "y"]).dtypes.to_dict()

vp_gdf = vp.map_partitions(
wrangle_shapes.vp_as_gdf,
crs = WGS84,
meta = {
**vp_dtypes,
"geometry": "geometry"
},
align_dataframes = True
)

vp_condensed = vp_gdf.map_partitions(
vp_transform.condense_point_geom_to_line,
vp_gdf = delayed(wrangle_shapes.vp_as_gdf)(vp, crs = WGS84)

vp_condensed = delayed(vp_transform.condense_point_geom_to_line)(
vp_gdf,
group_cols = ["trip_instance_key"],
geom_col = "geometry",
other_cols = ["vp_idx", "location_timestamp_local",
"vp_primary_direction"],
meta = {
"trip_instance_key": "object",
"geometry": "geometry",
"vp_idx": "object",
"location_timestamp_local": "object",
"vp_primary_direction": "object",
},
align_dataframes = False
).compute().set_geometry("geometry").set_crs(WGS84)
).set_geometry("geometry").set_crs(WGS84)

vp_condensed = compute(vp_condensed)[0]

utils.geoparquet_gcs_export(
vp_condensed,
SEGMENT_GCS,
f"{EXPORT_FILE}_{analysis_date}"
)

del vp_condensed


return


Expand Down Expand Up @@ -100,7 +81,7 @@ def prepare_vp_for_all_directions(
vp, direction)
for direction in wrangle_shapes.ALL_DIRECTIONS
]

results = [compute(i)[0] for i in dfs]

gdf = pd.concat(
Expand Down Expand Up @@ -135,10 +116,11 @@ def prepare_vp_for_all_directions(

for analysis_date in analysis_date_list:
start = datetime.datetime.now()

condense_vp_to_linestring(analysis_date, CONFIG_DICT)

time1 = datetime.datetime.now()

logger.info(
f"{analysis_date}: condense vp for trip "
f"{time1 - start}"
Expand Down
4 changes: 2 additions & 2 deletions gtfs_funnel/vp_direction.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,8 +160,8 @@ def add_direction_to_usable_vp(
).drop_duplicates(subset=["vp_idx", "vp_primary_direction"])

export_path = f"{SEGMENT_GCS}{INPUT_FILE}_{analysis_date}"
if fs.exists(export_path):
fs.rm(export_path, recursive=True)

helpers.if_exists_then_delete(export_path)

vp_with_dir.to_parquet(
export_path,
Expand Down
6 changes: 6 additions & 0 deletions high_quality_transit_areas/B1_create_hqta_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -105,8 +105,14 @@ def select_shapes_and_segment(
Concatenate these 2 portions and then cut HQTA segments.
Returns the hqta_segments for all the routes across all operators.
"""
# Exclude Amtrak shapes that run outside CA
outside_amtrak_shapes = gtfs_schedule_wrangling.amtrak_trips(
analysis_date, inside_ca = False).shape_array_key.unique()

gdf = gtfs_schedule_wrangling.longest_shape_by_route_direction(
analysis_date
).query(
'shape_array_key not in @outside_amtrak_shapes'
).drop(
columns = ["schedule_gtfs_dataset_key",
"shape_array_key", "route_length"]
Expand Down
Loading
Loading