diff --git a/_shared_utils/requirements.txt b/_shared_utils/requirements.txt index d6e1198b1..cd61b8687 100644 --- a/_shared_utils/requirements.txt +++ b/_shared_utils/requirements.txt @@ -1 +1,2 @@ -e . +gcsfs diff --git a/_shared_utils/setup.py b/_shared_utils/setup.py index 9d148588f..699b04eb1 100644 --- a/_shared_utils/setup.py +++ b/_shared_utils/setup.py @@ -3,7 +3,7 @@ setup( name="shared_utils", packages=find_packages(), - version="2.1.0", + version="2.2.0", description="Shared utility functions for data analyses", author="Cal-ITP", license="Apache", diff --git a/_shared_utils/shared_utils/rt_utils.py b/_shared_utils/shared_utils/rt_utils.py index 05e94d78f..b01871c43 100644 --- a/_shared_utils/shared_utils/rt_utils.py +++ b/_shared_utils/shared_utils/rt_utils.py @@ -15,14 +15,11 @@ import numpy as np import pandas as pd import shapely -from calitp_data.storage import get_fs from calitp_data_analysis.tables import tbls from numba import jit from shared_utils import geography_utils, gtfs_utils_v2, rt_dates, utils from siuba import * -fs = get_fs() - # set system time os.environ["TZ"] = "America/Los_Angeles" time.tzset() diff --git a/_shared_utils/shared_utils/styleguide.py b/_shared_utils/shared_utils/styleguide.py index 1a8cc0d2e..8ff7ce7d5 100644 --- a/_shared_utils/shared_utils/styleguide.py +++ b/_shared_utils/shared_utils/styleguide.py @@ -25,7 +25,6 @@ """ import altair as alt -from plotnine import * from shared_utils import calitp_color_palette as cp # --------------------------------------------------------------# @@ -60,10 +59,6 @@ """ -# --------------------------------------------------------------# -# Altair -# --------------------------------------------------------------# - def calitp_theme( font: str = font, @@ -254,34 +249,3 @@ def preset_chart_config(chart: alt.Chart) -> alt.Chart: ) return chart - - -# --------------------------------------------------------------# -# Plotnine -# --------------------------------------------------------------# -def preset_plotnine_config(chart): - chart = ( - chart - + theme_538() - + theme( - plot_background=element_rect(fill=backgroundColor, color=backgroundColor), - panel_background=element_rect(fill=backgroundColor, color=backgroundColor), - panel_grid_major_y=element_line(color=axisColor, linetype="solid", size=1), - panel_grid_major_x=element_blank(), - figure_size=(7.0, 4.4), - title=element_text(weight="bold", size=font_size, family=font, color=blackTitle), - axis_title=element_text(family=labelFont, size=12, color=guideTitleColor), - axis_text=element_text(family=labelFont, size=10, color=guideLabelColor, margin={"r": 4}), - axis_title_x=element_text(margin={"t": 10}), - axis_title_y=element_text(margin={"r": 10}), - legend_title=element_text(font=labelFont, size=14, color=blackTitle, margin={"b": 10}), - legend_text=element_text( - font=labelFont, - size=11, - color=blackTitle, - margin={"t": 5, "b": 5, "r": 5, "l": 5}, - ), - ) - ) - - return chart diff --git a/_shared_utils/shared_utils/utils.py b/_shared_utils/shared_utils/utils.py index d9c77b443..174f71bb6 100644 --- a/_shared_utils/shared_utils/utils.py +++ b/_shared_utils/shared_utils/utils.py @@ -9,11 +9,11 @@ import dask_geopandas as dg import fsspec +import gcsfs import geopandas as gpd import requests -from calitp_data.storage import get_fs -fs = get_fs() +fs = gcsfs.GCSFileSystem() def sanitize_file_path(file_name: str) -> str: diff --git a/rt_segment_speeds/segment_speed_utils/array_utils.py b/rt_segment_speeds/segment_speed_utils/array_utils.py index c68c9e2a3..43cced315 100644 --- a/rt_segment_speeds/segment_speed_utils/array_utils.py +++ b/rt_segment_speeds/segment_speed_utils/array_utils.py @@ -1,8 +1,8 @@ +""" +Functions for working with numpy arrays. +""" import numpy as np -#from numba import jit - -#@jit(parallel=True) def get_index(array: np.ndarray, item) -> int: """ Find the index for a certain value in an array. @@ -13,12 +13,12 @@ def get_index(array: np.ndarray, item) -> int: return idx[0] -#@jit(parallel=True) def subset_array_by_indices( array: np.ndarray, start_end_tuple: tuple ): """ + Subset an array using index positions. """ lower_idx = start_end_tuple[0] upper_idx = start_end_tuple[-1] + 1 diff --git a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py index 5a5949cd7..462cc7505 100644 --- a/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py +++ b/rt_segment_speeds/segment_speed_utils/gtfs_schedule_wrangling.py @@ -70,7 +70,7 @@ def get_trips_with_geom( shapes = helpers.import_scheduled_shapes( analysis_date, columns = ["shape_array_key", "geometry"], - get_pandas = False, + get_pandas = True, ) trips = helpers.import_scheduled_trips( diff --git a/rt_segment_speeds/segment_speed_utils/sched_rt_utils.py b/rt_segment_speeds/segment_speed_utils/sched_rt_utils.py index 9595a058d..2ae6c26fa 100644 --- a/rt_segment_speeds/segment_speed_utils/sched_rt_utils.py +++ b/rt_segment_speeds/segment_speed_utils/sched_rt_utils.py @@ -1,74 +1,20 @@ """ Functions for bridging schedule and RT data. -From RT data, gtfs_dataset_key is used. -From schedule data, feed_key is used. -These functions start with schedule data and add the RT gtfs_dataset_key. +RT and schedule trips are joined using trip_instance_key. +https://github.com/cal-itp/data-infra/pull/2489 + """ import dask_geopandas as dg import dask.dataframe as dd import geopandas as gpd import pandas as pd -from typing import List, Literal - -from shared_utils import schedule_rt_utils, rt_utils +from shared_utils import rt_utils from segment_speed_utils import helpers from segment_speed_utils.project_vars import COMPILED_CACHED_VIEWS, PROJECT_CRS -def crosswalk_scheduled_trip_grouping_with_rt_key( - analysis_date: str, - keep_trip_cols: list = ["feed_key", "trip_id"], - feed_types: List[Literal["vehicle_positions", - "trip_updates", - "service_alerts"]] = ["vehicle_positions"], - **kwargs -) -> pd.DataFrame: - """ - Filter scheduled trips to a certain grouping - (with route_id, direction_id or shape_array_key), - and merge in gtfs_dataset_key that comes from fct_rt_feeds. - - This is our crosswalk that we can stick in the middle of vp or segments - and that allows us to get feed_key and gtfs_dataset_key - """ - trips = helpers.import_scheduled_trips( - analysis_date, - columns = keep_trip_cols, - **kwargs - ) - - # Get the schedule feed_key and RT gtfs_dataset_key and add it to crosswalk - fct_rt_feeds = (schedule_rt_utils.get_rt_schedule_feeds_crosswalk( - analysis_date, - keep_cols = ["gtfs_dataset_key", "schedule_feed_key", "feed_type"], - get_df = True, - custom_filtering = {"feed_type": feed_types} - ).rename(columns = {"schedule_feed_key": "feed_key"}) - .drop(columns = "feed_type") - ) - - # Merge trips with fct_rt_feeds to get gtfs_dataset_key - if isinstance(trips, dd.DataFrame): - trips_with_rt_key = dd.merge( - trips, - fct_rt_feeds, - on = "feed_key", - how = "inner" - ) - - else: - trips_with_rt_key = pd.merge( - trips, - fct_rt_feeds, - on = "feed_key", - how = "inner" - ) - - return trips_with_rt_key - - def get_trip_time_buckets(analysis_date: str) -> pd.DataFrame: """ Assign trips to time-of-day.