Merge pull request #827 from cal-itp/clean-up-segment-speed-utils
Quarterly metrics / clean up segment_speed_utils and bus_service_utils
tiffanychu90 authored Aug 4, 2023
2 parents 0419179 + 88f67c6 commit ef8d7c4
Showing 29 changed files with 846 additions and 1,199 deletions.
1 change: 1 addition & 0 deletions _shared_utils/shared_utils/rt_dates.py
@@ -34,4 +34,5 @@
"Q4_2022": "2022-10-12",
"Q1_2023": "2023-01-18",
"Q2_2023": "2023-04-12",
"Q3_2023": "2023-07-12",
}
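The new `Q3_2023` entry extends the quarterly date lookup used across analyses. As a hedged sketch of how downstream code reads it (assuming the dict in `rt_dates.py` is named `DATES`; the diff above shows only its entries):

```python
# Sketch: pull the Q3 2023 analysis date from the shared lookup.
# Assumes the dict defined in rt_dates.py is named DATES.
from shared_utils import rt_dates

analysis_date = rt_dates.DATES["Q3_2023"]  # "2023-07-12"
```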
@@ -235,7 +235,7 @@ def generate_calenviroscreen_lehd_data(
    # Merge together
    final = merge_calenviroscreen_lehd(gdf, lehd)

-    if GCS is True:
+    if GCS:
        shared_utils.utils.geoparquet_gcs_export(
            final,
            utils.GCS_FILE_PATH,
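`if GCS:` is the idiomatic replacement for `if GCS is True:`. For a boolean flag the two behave identically; they diverge only if a caller passes a truthy non-boolean, which the truthiness check still accepts. A minimal illustration:

```python
def export(GCS: bool = True) -> str:
    # Truthiness check: True, 1, or any truthy value takes this branch;
    # the stricter `if GCS is True:` only matches the literal True.
    if GCS:
        return "write to GCS"
    return "write locally"

assert export(True) == "write to GCS"
assert export(1) == "write to GCS"      # would fail the `is True` version
assert export(False) == "write locally"
```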
@@ -144,7 +144,7 @@ def overlay_transit_to_highways(
        transit_routes,
        highways,
        how = "intersection",
-        keep_geom_type = False
+        keep_geom_type = True
    )

    # Using new geometry column, calculate what % that intersection
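Flipping `keep_geom_type` to `True` matters here: `geopandas.overlay` with `how="intersection"` can emit mixed geometry types (stray points where a route merely touches a highway polygon), and `keep_geom_type=True` keeps only geometries matching the first input's type. A toy sketch of the call pattern (made-up data, not the project's inputs):

```python
import geopandas as gpd
from shapely.geometry import LineString, Polygon

# Toy stand-ins for transit_routes (lines) and highways (polygons).
transit_routes = gpd.GeoDataFrame(
    {"route_id": ["r1"]},
    geometry=[LineString([(0, 0), (2, 2)])],
    crs="EPSG:3310",
)
highways = gpd.GeoDataFrame(
    {"hwy": ["I-80"]},
    geometry=[Polygon([(1, 0), (3, 0), (3, 3), (1, 3)])],
    crs="EPSG:3310",
)

# keep_geom_type=True drops any non-line pieces from the intersection,
# so the result stays line-typed like transit_routes.
intersected = gpd.overlay(
    transit_routes,
    highways,
    how="intersection",
    keep_geom_type=True,
)
```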
82 changes: 0 additions & 82 deletions bus_service_increase/bus_service_utils/utils.py
@@ -6,9 +6,6 @@
import pandas as pd

from calendar import THURSDAY, SATURDAY, SUNDAY
-from calitp_data.storage import get_fs
-
-fs = get_fs()

GCS_PROJECT = "cal-itp-data-infra"
BUCKET_NAME = "calitp-analytics-data"
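The module-level `get_fs()` handle can go because pandas resolves `gs://` paths itself through fsspec/gcsfs, which is also why the thin CSV-to-parquet wrappers deleted below could be inlined at their call sites. A minimal sketch of the direct pattern (assuming gcsfs is installed; the bucket path echoes the deleted docstring's example):

```python
import pandas as pd

# pandas routes gs:// URLs through fsspec/gcsfs; no explicit
# filesystem handle is required.
GCS_FILE_PATH = "gs://calitp-analytics-data/data-analyses/my-folder/"

df = pd.read_csv("my_dataset.csv")
df.to_parquet(f"{GCS_FILE_PATH}my_dataset.parquet")
```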
@@ -17,34 +14,6 @@

DATA_PATH = "./data/"
IMG_PATH = "./img/"


-def import_csv_export_parquet(dataset_name: str, output_file_name: str,
-                              GCS_FILE_PATH: str, GCS: bool = True):
-    """
-    DATASET_NAME: str. Name of csv dataset.
-    OUTPUT_FILE_NAME: str. Name of output parquet dataset.
-    GCS_FILE_PATH: str. Ex: gs://calitp-analytics-data/data-analyses/my-folder/
-    """
-    df = pd.read_csv(f"{dataset_name}.csv")
-
-    if GCS is True:
-        df.to_parquet(f"{GCS_FILE_PATH}{output_file_name}.parquet")
-    else:
-        df.to_parquet(f"./{output_file_name}.parquet")
-
-
-def import_export(DATASET_NAME: str, OUTPUT_FILE_NAME: str, GCS:bool=True):
-    """
-    DATASET_NAME: str. Name of csv dataset.
-    OUTPUT_FILE_NAME: str. Name of output parquet dataset.
-    """
-    df = pd.read_csv(f"{DATASET_NAME}.csv")
-
-    if GCS is True:
-        df.to_parquet(f"{GCS_FILE_PATH}{OUTPUT_FILE_NAME}.parquet")
-    else:
-        df.to_parquet(f"./{OUTPUT_FILE_NAME}.parquet")

def get_recent_dates() -> dict:
@@ -63,57 +32,6 @@ def get_recent_dates() -> dict:
    return dict(zip(['thurs', 'sat', 'sun'], dates))


-# There are multiple feeds, with different trip_keys but same trip_ids
-# Only keep calitp_url_number == 0 EXCEPT LA Metro
-def include_exclude_multiple_feeds(df: pd.DataFrame,
-                                   id_col: str = "itp_id",
-                                   include_ids: list = [182],
-                                   exclude_ids: list = [200]) -> pd.DataFrame:
-    """
-    df: pandas.DataFrame.
-    id_col: str, column name for calitp_itp_id, such as "itp_id"
-    include_ids: list,
-        list of itp_ids that are allowed to have multiple feeds
-        (Ex: LA Metro)
-    exclude_ids: list, list of itp_ids to drop. (Ex: MTC, regional feed)
-    """
-    # If there are explicit regional feeds to drop, put that in exclude_ids
-    group_cols = list(df.columns)
-    dup_cols = [i for i in group_cols if i != "calitp_url_number"]
-
-    df2 = (df[~df[id_col].isin(exclude_ids)]
-           .sort_values(group_cols)
-           .drop_duplicates(subset=dup_cols)
-           .reset_index(drop=True)
-          )
-
-    print(f"# obs in original df: {len(df)}")
-    print(f"# obs in new df: {len(df2)}")
-
-    # There are still multiple operators here
-    # But, seems like for those trip_ids, they are different values
-    # between url_number==0 vs url_number==1
-    multiple_urls = list(df2[df2.calitp_url_number==1][id_col].unique())
-    print(f"These operators have multiple calitp_url_number values: {multiple_urls}")
-
-    return df2
-
-
-def fix_gtfs_time(gtfs_timestring: str) -> str:
-    '''Reformats a GTFS timestamp (which allows the hour to exceed 24 to
-    mark service day continuity)
-    to standard 24-hour time.
-    '''
-    split = gtfs_timestring.split(':')
-    hour = int(split[0])
-    if hour >= 24:
-        split[0] = str(hour - 24)
-        corrected = (':').join(split)
-        return corrected.strip()
-    else:
-        return gtfs_timestring.strip()


# https://stackoverflow.com/questions/25052980/use-pickle-to-save-dictionary-in-python
def save_request_json(my_list: list,
                      name: str,
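Among the deletions, `fix_gtfs_time` encoded a GTFS subtlety worth recording: stop times may exceed 24:00:00 to mark trips that continue past midnight on the same service day. Its behavior, restated with worked examples:

```python
def fix_gtfs_time(gtfs_timestring: str) -> str:
    """Reformat a GTFS timestamp (hour may exceed 24 to mark
    service-day continuity) to standard 24-hour time."""
    split = gtfs_timestring.split(":")
    hour = int(split[0])
    if hour >= 24:
        split[0] = str(hour - 24)
        return ":".join(split).strip()
    return gtfs_timestring.strip()

assert fix_gtfs_time("25:15:00") == "1:15:00"   # 1:15 AM, next calendar day
assert fix_gtfs_time("08:30:00") == "08:30:00"  # already standard time
```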
2 changes: 1 addition & 1 deletion bus_service_increase/setup.py
@@ -3,7 +3,7 @@
setup(
    name="bus_service_utils",
    packages=find_packages(),
-    version="1.0",
+    version="2.0",
    description="Shared utility functions for bus service data analyses",
    author="Cal-ITP",
    license="Apache",
10 changes: 6 additions & 4 deletions open_data/create_routes_data.py
@@ -12,7 +12,7 @@

import prep_traffic_ops
from shared_utils import utils, geography_utils, portfolio_utils
-from segment_speed_utils import helpers, gtfs_schedule_wrangling
+from segment_speed_utils import helpers
from update_vars import analysis_date, TRAFFIC_OPS_GCS


@@ -32,12 +32,13 @@ def create_routes_file_for_export(date: str) -> gpd.GeoDataFrame:
        crs = geography_utils.WGS84
    ).dropna(subset="shape_array_key")

-    df = gtfs_schedule_wrangling.merge_shapes_to_trips(
+    df = pd.merge(
        shapes,
        trips,
-        merge_cols = ["shape_array_key"]
+        on = "shape_array_key",
+        how = "inner"
    ).drop(columns = "trip_id").drop_duplicates(subset="shape_array_key")

    df2 = remove_erroneous_shapes(df)

    drop_cols = ["route_short_name", "route_long_name", "route_desc"]
@@ -83,6 +84,7 @@ def remove_erroneous_shapes(

    return ok_shapes

+
def finalize_export_df(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame:
"""
Suppress certain columns used in our internal modeling for export.
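Swapping the `gtfs_schedule_wrangling.merge_shapes_to_trips` wrapper for a plain `pd.merge` makes the join explicit: an inner merge on `shape_array_key`, after which `drop_duplicates` collapses the many trips sharing a shape back to one row per shape. A toy illustration of the pattern (column values invented):

```python
import pandas as pd

shapes = pd.DataFrame({
    "shape_array_key": ["a", "b"],
    "n_points": [120, 85],
})
trips = pd.DataFrame({
    "shape_array_key": ["a", "a", "b"],  # several trips share one shape
    "trip_id": ["t1", "t2", "t3"],
})

df = (
    pd.merge(shapes, trips, on="shape_array_key", how="inner")
    .drop(columns="trip_id")
    .drop_duplicates(subset="shape_array_key")
)
print(len(df))  # 2 -- back to one row per shape
```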
8 changes: 5 additions & 3 deletions open_data/create_stops_data.py
@@ -13,7 +13,7 @@

import prep_traffic_ops
from shared_utils import utils, geography_utils, schedule_rt_utils
-from segment_speed_utils import helpers, gtfs_schedule_wrangling
+from segment_speed_utils import helpers
from update_vars import analysis_date, TRAFFIC_OPS_GCS


@@ -44,9 +44,11 @@ def attach_route_info_to_stops(
        .reset_index(drop=True)
    ).compute()

-    stops_with_geom = gtfs_schedule_wrangling.attach_stop_geometry(
+    stops_with_geom = dd.merge(
+        stops,
        stops_with_route_info,
-        stops
+        on = ["feed_key", "stop_id"],
+        how = "inner"
    )

    stops_assembled = (stops_with_geom
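`dd.merge` mirrors the `pd.merge` API for dask dataframes and accepts a plain pandas frame on either side, so the explicit join reads the same as the pandas version above. A minimal sketch with toy frames (keys follow the diff; values are invented):

```python
import dask.dataframe as dd
import pandas as pd

stops = dd.from_pandas(
    pd.DataFrame({
        "feed_key": ["f1", "f1"],
        "stop_id": ["s1", "s2"],
    }),
    npartitions=1,
)
stops_with_route_info = pd.DataFrame({
    "feed_key": ["f1"],
    "stop_id": ["s1"],
    "route_id": ["r1"],
})

# dd.merge mixes dask and pandas inputs; the join stays lazy
# until .compute() is called.
stops_with_geom = dd.merge(
    stops,
    stops_with_route_info,
    on=["feed_key", "stop_id"],
    how="inner",
)
print(stops_with_geom.compute())  # one row: (f1, s1, r1)
```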
2 changes: 1 addition & 1 deletion open_data/download_trips.py
@@ -23,7 +23,7 @@ def get_operators(analysis_date: str):
        keep_cols = None,
        get_df = True,
        feed_option = "use_subfeeds"
-    ).rename(columns = {"_gtfs_dataset_name": "name"})
+    ).rename(columns = {"gtfs_dataset_name": "name"})

keep_cols = ["feed_key", "name"]

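The rename tracks an upstream schema change: the dataset-name column is now emitted without the leading underscore. If code must tolerate extracts from both before and after the change, one option is a rename map covering both spellings, since `DataFrame.rename` silently ignores absent keys. A hypothetical sketch (this helper is not part of the PR):

```python
import pandas as pd

def standardize_name_col(df: pd.DataFrame) -> pd.DataFrame:
    # Hypothetical helper: map either spelling of the column to "name";
    # rename() ignores keys that are not present in the frame.
    return df.rename(columns={
        "_gtfs_dataset_name": "name",
        "gtfs_dataset_name": "name",
    })
```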
Git LFS file not shown
Git LFS file not shown
15 changes: 8 additions & 7 deletions portfolio/quarterly_performance_metrics/README.md
@@ -14,18 +14,19 @@ Initially presented for the Planning and Modal Advisory Committee (PMAC).
## Workflow
### Data Generation

-1. [Generate processed data for categories and service hours](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A1_generate_routes_on_shn_data.py) with GTFS schedule data
-1. [Categorize routes into 3 groups](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A2_categorize_routes.py)
-1. [Generate endpoint data processed data](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A3_generate_endpoint_delay.py) with GTFS real-time data
-1. [Merge service hours and endpoint delay](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A4_route_service_hours_delay.py)
+1. [Aggregate from shape level to route level](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A1_scheduled_route_level_df)
+1. [Generate processed data for categories and service hours](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A2_generate_routes_on_shn_data.py) with GTFS schedule data
+1. [Categorize routes into 3 groups](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/A3_categorize_routes.py)
+1. [Generate processed endpoint delay data](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/B1_generate_endpoint_delay.py) with GTFS real-time data
+1. [Merge service hours and endpoint delay](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/B2_route_service_hours_delay.py)

### Helper Scripts for Reports
-1. [data prep functions](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/B1_report_metrics.py)
-1. [chart functions](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/B2_report_charts.py)
+1. [data prep functions](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/C1_report_metrics.py)
+1. [chart functions](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/C2_report_charts.py)

### Reports

Create a report of the current quarter's snapshot as well as a historical comparison of quarterly metrics.

1. [current quarter's snapshot](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/current_quarter_report.ipynb)
-1. [historical comparison](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/historical_report.ipynb)
+1. [historical comparison](https://github.com/cal-itp/data-analyses/blob/main/quarterly_performance_objective/historical_service_hours_v2.ipynb)
2 changes: 1 addition & 1 deletion portfolio/quarterly_performance_metrics/_config.yml
@@ -35,7 +35,7 @@ html:
  use_issues_button: true
  use_repository_button: true
  use_edit_page_button: true
-  google_analytics_id: G-SZB618VNBZ
+  google_analytics_id: 'G-JCX3Z8JZJC'

sphinx:
  config: