diff --git a/gtfs_funnel/vp_condenser.py b/gtfs_funnel/vp_condenser.py index a16e908dd..99d7d4cb6 100644 --- a/gtfs_funnel/vp_condenser.py +++ b/gtfs_funnel/vp_condenser.py @@ -24,25 +24,26 @@ def condense_vp_to_linestring( We will group by trip and save out the vp point geom into a shapely.LineString. """ - USABLE_VP = dict_inputs.speeds_tables.usable_vp - EXPORT_FILE = dict_inputs.speeds_tables.vp_condensed_line + USABLE_VP = dict_inputs.speeds_tables.usable_vp + "_with_dwell" + EXPORT_FILE = dict_inputs.speeds_tables.vp_condensed_line + "_dwell" vp = delayed(pd.read_parquet)( f"{SEGMENT_GCS}{USABLE_VP}_{analysis_date}", columns = ["trip_instance_key", "x", "y", "vp_idx", "vp_primary_direction", - "location_timestamp_local" + "location_timestamp_local", + "moving_timestamp_local", ], - ) - - vp_gdf = delayed(wrangle_shapes.vp_as_gdf)(vp, crs = WGS84) - + ).pipe(wrangle_shapes.vp_as_gdf, crs = WGS84) + vp_condensed = delayed(vp_transform.condense_point_geom_to_line)( - vp_gdf, + vp, group_cols = ["trip_instance_key"], geom_col = "geometry", other_cols = ["vp_idx", "location_timestamp_local", - "vp_primary_direction"], + "moving_timestamp_local", + "vp_primary_direction", + ], ).set_geometry("geometry").set_crs(WGS84) vp_condensed = compute(vp_condensed)[0] @@ -69,8 +70,8 @@ def prepare_vp_for_all_directions( Subset vp_idx, location_timestamp_local and coordinate arrays to exclude southbound. 
""" - INPUT_FILE = dict_inputs.speeds_tables.vp_condensed_line - EXPORT_FILE = dict_inputs.speeds_tables.vp_nearest_neighbor + INPUT_FILE = dict_inputs.speeds_tables.vp_condensed_line + "_dwell" + EXPORT_FILE = dict_inputs.speeds_tables.vp_nearest_neighbor + "_dwell" vp = delayed(gpd.read_parquet)( f"{SEGMENT_GCS}{INPUT_FILE}_{analysis_date}.parquet", @@ -109,11 +110,12 @@ def prepare_vp_for_all_directions( format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", level="INFO") + from shared_utils import rt_dates - for analysis_date in analysis_date_list: + for analysis_date in [rt_dates.DATES["apr2024"]]:#analysis_date_list: start = datetime.datetime.now() - condense_vp_to_linestring(analysis_date, GTFS_DATA_DICT) + #condense_vp_to_linestring(analysis_date, GTFS_DATA_DICT) time1 = datetime.datetime.now() diff --git a/gtfs_funnel/vp_dwell_time.py b/gtfs_funnel/vp_dwell_time.py new file mode 100644 index 000000000..cd9c8bb7b --- /dev/null +++ b/gtfs_funnel/vp_dwell_time.py @@ -0,0 +1,227 @@ +""" +Add dwell time to vp +""" +import datetime +import pandas as pd +import sys + +from dask import delayed, compute +from loguru import logger + +from segment_speed_utils import helpers, segment_calcs +from segment_speed_utils.project_vars import SEGMENT_GCS + +def import_vp(analysis_date: str) -> pd.DataFrame: + """ + Import vehicle positions with a subset of columns + we need to check whether bus is dwelling + at a location. + """ + vp = pd.read_parquet( + f"{SEGMENT_GCS}vp_usable_{analysis_date}", + columns = [ + "trip_instance_key", "vp_idx", + "location_timestamp_local", "vp_primary_direction" + ], + ) + + return vp + + +def group_vp_dwelling_rows(df: pd.DataFrame) -> pd.DataFrame: + """ + We do not know how many vp have repeated positions consecutively, + but we want to consolidate as many as we can until it moves to + the next position. + + We know that the prior position has a vp index of vp_idx - 1. + If it's not that, then it's moved on. 
def split_into_moving_and_dwelling(vp: pd.DataFrame):
    """
    Split vp into moving vs possibly-dwelling rows and assign each vp
    to a dwell group.

    A vp with vp_primary_direction == "Unknown" may be dwelling: the
    x, y did not change, so no direction could be calculated. (The
    only exception is a trip's first vp, which has no prior point to
    calculate direction against.) Only those unknown rows — plus the
    row immediately before each one — need the expensive row-wise
    consolidation in group_vp_dwelling_rows; everything else is
    known to be moving.

    Returns the full vp frame with `is_moving` (int8) and
    `vp_grouping` (cumulative count of movements per trip; dwelling
    vp share the group id of the vp that arrived at the position).
    """
    usable_bounds = segment_calcs.get_usable_vp_bounds_by_trip(
        vp
    ).drop(columns = "max_vp_idx")

    vp2 = pd.merge(
        vp,
        usable_bounds,
        on = "trip_instance_key",
        how = "inner"
    )

    vp2 = vp2.assign(
        prior_expected = vp2.vp_idx - 1,
    )

    unknown_direction = vp2.vp_primary_direction == "Unknown"

    # vp right before each unknown — we want to grab just the one above
    subset_vp_prior = vp2.loc[
        unknown_direction
    ].prior_expected.unique().tolist()

    subset_unknown_vp = vp2.loc[
        unknown_direction
    ].vp_idx.unique().tolist()

    # Hoisted: the original computed this isin() over Python lists
    # twice; build one set-backed boolean mask and reuse it.
    in_unknown_subset = vp2.vp_idx.isin(
        set(subset_vp_prior + subset_unknown_vp)
    )

    # These vp have unknowns and may need to consolidate
    # (leave first vp in, in case the second vp is unknown)
    vp_unknowns = vp2.loc[in_unknown_subset]

    # Vast majority of vp are here; no change is happening to them,
    # so we skip the expensive shift-based consolidation.
    vp_knowns = vp2.loc[~in_unknown_subset]

    vp_unknowns2 = group_vp_dwelling_rows(vp_unknowns)

    vp3 = pd.concat(
        [vp_knowns, vp_unknowns2],
        axis=0, ignore_index=True
    ).drop(
        columns = ["prior", "prior_expected"]
    ).fillna(
        {"is_moving": 1}
    ).astype(
        {"is_moving": "int8"}
    ).sort_values("vp_idx").reset_index(drop=True)

    vp3 = vp3.assign(
        # since is_moving=0 while the vp is dwelling,
        # cumsum() does not change from the prior vp, so a run of
        # dwelling vp holds the same vp_grouping value; once the vp
        # moves (is_moving=1) cumsum() increases again
        vp_grouping = (vp3.groupby("trip_instance_key",
                                   observed=True, group_keys=False)
                       .is_moving
                       .cumsum()
                      )
    )

    return vp3
+ """ + usable_bounds = segment_calcs.get_usable_vp_bounds_by_trip( + vp + ).drop(columns = "max_vp_idx") + + vp2 = pd.merge( + vp, + usable_bounds, + on = "trip_instance_key", + how = "inner" + ) + + vp2 = vp2.assign( + prior_expected = vp2.vp_idx - 1, + ) + + # keep subset of prior vp when we have unknowns, + #then we want to grab just the one above + subset_vp_prior = vp2[ + vp2.vp_primary_direction=="Unknown" + ].prior_expected.unique().tolist() + + subset_unknown_vp = vp2[ + vp2.vp_primary_direction=="Unknown" + ].vp_idx.unique().tolist() + + # These vp have unknowns and may need to consolidate + # leave first vp in, just in case the second vp is unknown + vp_unknowns = vp2.loc[ + vp2.vp_idx.isin(subset_vp_prior + subset_unknown_vp) + ] + + # Vast majority of vp should be here, and we want to + # separate these out because no change is happening + # and we don't want to do an expensive row-wise shift on these + vp_knowns = vp2.loc[~vp2.vp_idx.isin(subset_vp_prior + subset_unknown_vp)] + + vp_unknowns2 = group_vp_dwelling_rows(vp_unknowns) + + vp3 = pd.concat( + [vp_knowns, vp_unknowns2], + axis=0, ignore_index=True + ).drop( + columns = ["prior", "prior_expected"] + ).fillna( + {"is_moving": 1} + ).astype( + {"is_moving": "int8"} + ).sort_values("vp_idx").reset_index(drop=True) + + vp3 = vp3.assign( + # since is_moving=0 if the vp is dwelling, + # cumsum() will not change from the prior vp + # and a set of 2 or 3 will hold the same vp_grouping value + # once the vp moves and is_moving=1, then cumsum() will increase again + vp_grouping = (vp3.groupby("trip_instance_key", + observed=True, group_keys=False) + .is_moving + .cumsum() + ) + ) + + return vp3 + + +def add_dwell_time( + vp_grouped: pd.DataFrame, +) -> pd.DataFrame: + """ + Take vp that have their groups flagged and + add dwell time (in seconds). Dwell time is calculated + for this vp_location, which may not necessarily be a bus stop. 
+ """ + group_cols = ["trip_instance_key", "vp_grouping"] + + start_vp = (vp_grouped + .groupby(group_cols, observed=True, group_keys=False) + .agg({ + "vp_idx": "min", + "location_timestamp_local": "min", + "vp_primary_direction": "count" + }).reset_index() + .rename(columns = {"vp_primary_direction": "n_vp_at_location"}) + ) + + end_vp = (vp_grouped + .groupby(group_cols, observed=True, group_keys=False) + .agg({ + "vp_idx": "max", + "location_timestamp_local": "max" + }).reset_index() + .rename(columns = { + "vp_idx": "end_vp_idx", + "location_timestamp_local": "moving_timestamp_local" + }) + ) + + df = pd.merge( + start_vp, + end_vp, + on = group_cols, + how = "inner" + ) + + df = df.assign( + dwell_sec = (df.moving_timestamp_local - + df.location_timestamp_local).dt.total_seconds().astype("int") + ) + + return df + +if __name__ == "__main__": + + LOG_FILE = "./logs/vp_preprocessing.log" + logger.add(LOG_FILE, retention="3 months") + logger.add(sys.stderr, + format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", + level="INFO") + + analysis_date = "2024-04-17" + + start = datetime.datetime.now() + + vp = delayed(import_vp)(analysis_date) + + vp_grouped = delayed(split_into_moving_and_dwelling)(vp) + + vp_with_dwell = delayed(add_dwell_time)(vp_grouped) + + vp_with_dwell = compute(vp_with_dwell)[0] + + time1 = datetime.datetime.now() + logger.info(f"compute dwell df: {time1 - start}") + + vp_usable = pd.read_parquet( + f"{SEGMENT_GCS}vp_usable_{analysis_date}", + ) + + vp_usable_with_dwell = pd.merge( + vp_usable, + vp_with_dwell, + on = ["trip_instance_key", "vp_idx", "location_timestamp_local"], + how = "inner" + ) + + helpers.if_exists_then_delete(f"{SEGMENT_GCS}vp_usable_with_dwell_{analysis_date}") + + vp_usable_with_dwell.to_parquet( + f"{SEGMENT_GCS}vp_usable_with_dwell_{analysis_date}", + partition_cols = "gtfs_dataset_key", + ) + + end = datetime.datetime.now() + logger.info(f"merge with original and export: {end - time1}") + logger.info(f"vp 
with dwell time: {end - start}") \ No newline at end of file diff --git a/rt_segment_speeds/37_bbb_speeds_with_dwell.ipynb b/rt_segment_speeds/37_bbb_speeds_with_dwell.ipynb new file mode 100644 index 000000000..0b28a2061 --- /dev/null +++ b/rt_segment_speeds/37_bbb_speeds_with_dwell.ipynb @@ -0,0 +1,763 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "fe5aafc6-e153-4727-a26c-cfbf6ea892c4", + "metadata": {}, + "source": [ + "# Big Blue Bus speeds using `vp_usable_with_dwell`\n", + "\n", + "## Alignments\n", + "* Align nearest neighbor and interpolation steps\n", + "* Dwell times are factored in\n", + "* Interpolation of stop arrival looks at the previous `moving_timestamp_local` which is after the bus gets moving again, and this stop's `location_timestamp_local`, which is the beginning of the bus's dwelling at a stop.\n", + "\n", + "## Speed\n", + "* Retain how speed is calculated - even remove filters (in averaging, speeds over 80 mph are filtered out) to show full distribution\n", + "* In the full distribution, we'll keep extra high speeds in, and decide on where are high/low filters should be" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "id": "667c4b7f-f402-4a03-9ea6-77bfbdf84ca1", + "metadata": {}, + "outputs": [], + "source": [ + "import folium\n", + "import geopandas as gpd\n", + "import pandas as pd\n", + "\n", + "from segment_speed_utils.project_vars import SEGMENT_GCS\n", + "from shared_utils import rt_dates, rt_utils\n", + "\n", + "analysis_date = rt_dates.DATES[\"apr2024\"]" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "id": "197232ec-c256-4aee-92d4-97815558e1d2", + "metadata": {}, + "outputs": [], + "source": [ + "speeds = pd.read_parquet(\n", + " f\"{SEGMENT_GCS}bbb_speeds_by_trip_{analysis_date}.parquet\"\n", + ")\n", + " \n", + "segment_speeds = gpd.read_parquet(\n", + " f\"{SEGMENT_GCS}bbb_segment_speeds_gdf_{analysis_date}.parquet\"\n", + ")" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + 
"id": "40514b39-dcd7-4fd9-8b37-6ef7bd9242d8", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 32161.000000\n", + "mean 14.200993\n", + "std 8.277521\n", + "min 0.033953\n", + "25% 8.890265\n", + "50% 12.699928\n", + "75% 17.480291\n", + "max 79.641062\n", + "Name: speed_mph, dtype: float64" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "speeds[speeds.speed_mph <= 80].speed_mph.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "id": "2f446c73-2623-4c40-ba2d-db20ea9a6068", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + }, + { + "data": { + "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjAAAAGdCAYAAAAMm0nCAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8/fFQqAAAACXBIWXMAAA9hAAAPYQGoP6dpAAAnBklEQVR4nO3de3TU5Z3H8c/kMhMiJOFibjUJUSt3kIYSpl6qEhIwa71w9kilipXK0Q1dMS0KXiBIbRBbr8vKcauwewpF3KNUCYWMQcBLAMmactFFtChWSehKIYTIZMw8+4cnU8ckQGAmk2fyfp0zB+f5PfPM880vnXz6/C7jMMYYAQAAWCQm0hMAAADoLAIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6cZGeQLj4/X59/vnn6tOnjxwOR6SnAwAAToMxRseOHVNmZqZiYjpeZ4naAPP5558rKysr0tMAAABn4NNPP9V5553X4faoDTB9+vSR9PUPICkpKWTj+nw+VVZWqrCwUPHx8SEbtzvpCTVKPaNOaowO1BgdqPH0NDQ0KCsrK/B3vCNRG2BaDxslJSWFPMAkJiYqKSkpqn8Bo71GqWfUSY3RgRqjAzV2zqlO/+AkXgAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrxEV6Aui+hpdtkLcl+OvMP15UHKHZAADwD6zAAAAA6xBgAACAdQgwAADAOpwDg5AYOKei3XbOmQEAhAMrMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArMNVSOiUjq42AgCgK7ECAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsw5140ebuuq5Yo8VjIzQZAABOAyswAADAOp0KMOXl5fr+97+vPn36KDU1Vdddd5327t0b1OfEiRMqKSlR//791bt3b02ePFn19fVBfQ4cOKDi4mIlJiYqNTVVs2fP1ldffRXUZ9OmTfre974nl8ulCy+8UMuXLz+zCgEAQNTpVIDZvHmzSkpKtHXrVnk8Hvl8PhUWFur48eOBPnfffbdeffVVvfjii9q8ebM+//xz3XDDDYHtLS0tKi4uVnNzs95++23953
/+p5YvX6558+YF+uzfv1/FxcW68sorVVtbq1mzZulnP/uZNmzYEIKSAQCA7Tp1Dsz69euDni9fvlypqamqqanR5ZdfrqNHj+q5557TypUrddVVV0mSli1bpiFDhmjr1q0aN26cKisr9d577+m1115TWlqaLr74Yi1cuFD33nuvysrK5HQ6tXTpUuXm5uq3v/2tJGnIkCF688039fjjj6uoqChEpQMAAFud1Um8R48elST169dPklRTUyOfz6eCgoJAn8GDBys7O1vV1dUaN26cqqurNWLECKWlpQX6FBUV6c4779SePXs0evRoVVdXB43R2mfWrFkdzsXr9crr9QaeNzQ0SJJ8Pp98Pt/ZlBmkdaxQjhlprlgT/DzGBP17Nrrzzyka9+W3UWN0oMboQI2dG+NUzjjA+P1+zZo1S5dccomGDx8uSaqrq5PT6VRKSkpQ37S0NNXV1QX6fDO8tG5v3XayPg0NDfryyy/Vq1evNvMpLy/XggUL2rRXVlYqMTHxzIo8CY/HE/IxI6WjK44WjvGf9djr1q076zHCLZr2ZUeoMTpQY3SgxpNramo6rX5nHGBKSkq0e/duvfnmm2c6REjNnTtXpaWlgecNDQ3KyspSYWGhkpKSQvY+Pp9PHo9HEyZMUHx8fMjGjaThZcHnFrlijBaO8evBHTHy+h1nNfbusu57yC8a9+W3UWN0oMboQI2np/UIyqmcUYCZOXOm1q5dqy1btui8884LtKenp6u5uVlHjhwJWoWpr69Xenp6oM/27duDxmu9Sumbfb595VJ9fb2SkpLaXX2RJJfLJZfL1aY9Pj4+LL8o4Ro3Erwt7YcUr9/R4bbT9d0HK9u0fbyo+KzGDLVo2pcdocboQI3RgRpP/drT0amrkIwxmjlzpl5++WVt3LhRubm5Qdvz8vIUHx+vqqqqQNvevXt14MABud1uSZLb7dauXbt06NChQB+Px6OkpCQNHTo00OebY7T2aR0DAAD0bJ1agSkpKdHKlSv1xz/+UX369Amcs5KcnKxevXopOTlZ06dPV2lpqfr166ekpCT9/Oc/l9vt1rhx4yRJhYWFGjp0qG6++WYtXrxYdXV1euCBB1RSUhJYQbnjjjv0b//2b7rnnnt02223aePGjVq9erUqKio6nBsAAOg5OrUC88wzz+jo0aO64oorlJGREXi88MILgT6PP/64/umf/kmTJ0/W5ZdfrvT0dL300kuB7bGxsVq7dq1iY2Pldrv1k5/8RLfccoseeuihQJ/c3FxVVFTI4/Fo1KhR+u1vf6vf/e53XEINAAAkdXIFxphTX1abkJCgJUuWaMmSJR32ycnJOeXVKVdccYXefffdzkwPAAD0EHwXEgAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADW6dSXOcJuA+dURHoKAACEBCswAADAOgQYAABgHQ4hoct1dCjr40XFXTwTAICtWIEBAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1omL9AQQegPnVER6CgAAhBUrMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwTlykJwC0Gjinot32jxcVd/FMAADdXadXYLZs2aJrrrlGmZmZcjgcWrNmTd
D2W2+9VQ6HI+gxceLEoD6HDx/W1KlTlZSUpJSUFE2fPl2NjY1BfXbu3KnLLrtMCQkJysrK0uLFiztfHQAAiEqdDjDHjx/XqFGjtGTJkg77TJw4UQcPHgw8/vCHPwRtnzp1qvbs2SOPx6O1a9dqy5YtmjFjRmB7Q0ODCgsLlZOTo5qaGj366KMqKyvTs88+29npAgCAKNTpQ0iTJk3SpEmTTtrH5XIpPT293W3vv/++1q9fr3feeUdjxoyRJD399NO6+uqr9Zvf/EaZmZlasWKFmpub9fzzz8vpdGrYsGGqra3VY489FhR00PFhFwAAollYzoHZtGmTUlNT1bdvX1111VX61a9+pf79+0uSqqurlZKSEggvklRQUKCYmBht27ZN119/vaqrq3X55ZfL6XQG+hQVFemRRx7R3//+d/Xt27fNe3q9Xnm93sDzhoYGSZLP55PP5wtZba1jhXLMs+GKNaEfM8YE/Rtp4fpZd7d9GQ7UGB2oMTpQY+fGOJWQB5iJEyfqhhtuUG5urj766CPdd999mjRpkqqrqxUbG6u6ujqlpqYGTyIuTv369VNdXZ0kqa6uTrm5uUF90tLSAtvaCzDl5eVasGBBm/bKykolJiaGqrwAj8cT8jHPxOKx4Rt74Rh/+AbvhHXr1oV1/O6yL8OJGqMDNUYHajy5pqam0+oX8gAzZcqUwH+PGDFCI0eO1AUXXKBNmzZp/PjxoX67gLlz56q0tDTwvKGhQVlZWSosLFRSUlLI3sfn88nj8WjChAmKj48P2bhnanjZhpCP6YoxWjjGrwd3xMjrd4R8/M7aXVYUlnG7274MB2qMDtQYHajx9LQeQTmVsF9Gff7552vAgAH68MMPNX78eKWnp+vQoUNBfb766isdPnw4cN5Menq66uvrg/q0Pu/o3BqXyyWXy9WmPT4+Piy/KOEat7O8LeELGF6/I6zjn65w/5y7y74MJ2qMDtQYHajx1K89HWG/kd1f//pXffHFF8rIyJAkud1uHTlyRDU1NYE+GzdulN/vV35+fqDPli1bgo6DeTweDRo0qN3DRwAAoGfpdIBpbGxUbW2tamtrJUn79+9XbW2tDhw4oMbGRs2ePVtbt27Vxx9/rKqqKl177bW68MILVVT09WGAIUOGaOLEibr99tu1fft2vfXWW5o5c6amTJmizMxMSdJNN90kp9Op6dOna8+ePXrhhRf05JNPBh0iAgAAPVenA8yOHTs0evRojR49WpJUWlqq0aNHa968eYqNjdXOnTv1ox/9SBdddJGmT5+uvLw8vfHGG0GHd1asWKHBgwdr/Pjxuvrqq3XppZcG3eMlOTlZlZWV2r9/v/Ly8vSLX/xC8+bN4xJqAAAg6QzOgbniiitkTMeX127YcOqTSvv166eVK1eetM/IkSP1xhtvdHZ6AACgB+DLHAEAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsExfpCQCnMnBORZu2jxcVR2AmAIDughUYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOXyUAK7X39QISXzEAAD0FKzAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1omL9ARwegbOqYj0FAAA6DZYgQEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALBOXKQnAITSwDkV7bZ/vKi4i2cCAAgnVmAAAI
B1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANbpdIDZsmWLrrnmGmVmZsrhcGjNmjVB240xmjdvnjIyMtSrVy8VFBRo3759QX0OHz6sqVOnKikpSSkpKZo+fboaGxuD+uzcuVOXXXaZEhISlJWVpcWLF3e+OgAAEJU6HWCOHz+uUaNGacmSJe1uX7x4sZ566iktXbpU27Zt0znnnKOioiKdOHEi0Gfq1Knas2ePPB6P1q5dqy1btmjGjBmB7Q0NDSosLFROTo5qamr06KOPqqysTM8+++wZlAgAAKJNp+/EO2nSJE2aNKndbcYYPfHEE3rggQd07bXXSpL+67/+S2lpaVqzZo2mTJmi999/X+vXr9c777yjMWPGSJKefvppXX311frNb36jzMxMrVixQs3NzXr++efldDo1bNgw1dbW6rHHHgsKOgAAoGcK6VcJ7N+/X3V1dSooKAi0JScnKz8/X9XV1ZoyZYqqq6uVkpISCC+SVFBQoJiYGG3btk3XX3+9qqurdfnll8vpdAb6FBUV6ZFHHtHf//539e3bt817e71eeb3ewPOGhgZJks/nk8/nC1mNrWOFcszT4Yo1XfdeMSbo32jQ3v6K1L7sStQYHagxOlBj58Y4lZAGmLq6OklSWlpaUHtaWlpgW11dnVJTU4MnERenfv36BfXJzc1tM0brtvYCTHl5uRYsWNCmvbKyUomJiWdYUcc8Hk/IxzyZxWO79O0kSQvH+Lv+TcNk3bp1HW7r6n0ZCdQYHagxOlDjyTU1NZ1Wv6j5Mse5c+eqtLQ08LyhoUFZWVkqLCxUUlJSyN7H5/PJ4/FowoQJio+PD9m4pzK8bEOXvZcrxmjhGL8e3BEjr9/RZe8bTrvLitq0RWpfdiVqjA7UGB2o8fS0HkE5lZAGmPT0dElSfX29MjIyAu319fW6+OKLA30OHToU9LqvvvpKhw8fDrw+PT1d9fX1QX1an7f2+TaXyyWXy9WmPT4+Piy/KOEatyPelq4PEl6/IyLvGw4n21ddvS8jgRqjAzVGB2o89WtPR0jvA5Obm6v09HRVVVUF2hoaGrRt2za53W5Jktvt1pEjR1RTUxPos3HjRvn9fuXn5wf6bNmyJeg4mMfj0aBBg9o9fAQAAHqWTgeYxsZG1dbWqra2VtLXJ+7W1tbqwIEDcjgcmjVrln71q1/plVde0a5du3TLLbcoMzNT1113nSRpyJAhmjhxom6//XZt375db731lmbOnKkpU6YoMzNTknTTTTfJ6XRq+vTp2rNnj1544QU9+eSTQYeIAABAz9XpQ0g7duzQlVdeGXjeGiqmTZum5cuX65577tHx48c1Y8YMHTlyRJdeeqnWr1+vhISEwGtWrFihmTNnavz48YqJidHkyZP11FNPBbYnJyersrJSJSUlysvL04ABAzRv3jwuoQYAAJLOIMBcccUVMqbjy2sdDoceeughPfTQQx326devn1auXHnS9xk5cqTeeOONzk4PAAD0AHwXEgAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADW6fSXOSK8Bs6piPQUAADo9liBAQAA1iHAAAAA63AICT1Ce4fmXLFGi8dGYDIAgLPGCgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA534kWPN7xsg7wtjsDzjxcVR3A2AIDTwQoMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYB0CDAAAsA4BBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOnGRngDQ3QycU9Fu+8eLirt4JgCAjrACAwAArEOAAQAA1iHAAA
AA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsQ4ABAADWIcAAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKwTF+kJALYYOKei3faPFxV38UwAAKzAAAAA6xBgAACAdQgwAADAOiEPMGVlZXI4HEGPwYMHB7afOHFCJSUl6t+/v3r37q3Jkyervr4+aIwDBw6ouLhYiYmJSk1N1ezZs/XVV1+FeqoAAMBSYTmJd9iwYXrttdf+8SZx/3ibu+++WxUVFXrxxReVnJysmTNn6oYbbtBbb70lSWppaVFxcbHS09P19ttv6+DBg7rlllsUHx+vX//61+GYLgAAsExYAkxcXJzS09PbtB89elTPPfecVq5cqauuukqStGzZMg0ZMkRbt27VuHHjVFlZqffee0+vvfaa0tLSdPHFF2vhwoW69957VVZWJqfTGY4pR0RHV7UAAICTC0uA2bdvnzIzM5WQkCC3263y8nJlZ2erpqZGPp9PBQUFgb6DBw9Wdna2qqurNW7cOFVXV2vEiBFKS0sL9CkqKtKdd96pPXv2aPTo0e2+p9frldfrDTxvaGiQJPl8Pvl8vpDV1jpWKMZ0xZqzHiMcXDEm6N9oFao6Q/n7FWqh/H3trqgxOlBjdAhFjaf7WocxJqR/pf70pz+psbFRgwYN0sGDB7VgwQJ99tln2r17t1599VX99Kc/DQoakjR27FhdeeWVeuSRRzRjxgx98skn2rBhQ2B7U1OTzjnnHK1bt06TJk1q933Lysq0YMGCNu0rV65UYmJiKEsEAABh0tTUpJtuuklHjx5VUlJSh/1CvgLzzYAxcuRI5efnKycnR6tXr1avXr1C/XYBc+fOVWlpaeB5Q0ODsrKyVFhYeNIfQGf5fD55PB5NmDBB8fHxZzXW8LINp+4UAa4Yo4Vj/HpwR4y8fkekpxM2oapzd1lRCGcVWqH8fe2uqDE6UGN0CEWNrUdQTiXsd+JNSUnRRRddpA8//FATJkxQc3Ozjhw5opSUlECf+vr6wDkz6enp2r59e9AYrVcptXdeTSuXyyWXy9WmPT4+Piy/KKEY19vSvcOB1+/o9nMMhbOt87sPVrZp62535w3X/w66E2qMDtQYHc6mxtN9XdjvA9PY2KiPPvpIGRkZysvLU3x8vKqqqgLb9+7dqwMHDsjtdkuS3G63du3apUOHDgX6eDweJSUlaejQoeGeLgAAsEDIV2B++ctf6pprrlFOTo4+//xzzZ8/X7Gxsfrxj3+s5ORkTZ8+XaWlperXr5+SkpL085//XG63W+PGjZMkFRYWaujQobr55pu1ePFi1dXV6YEHHlBJSUm7KywAAKDnCXmA+etf/6of//jH+uKLL3Tuuefq0ksv1datW3XuuedKkh5//HHFxMRo8uTJ8nq9Kioq0r//+78HXh8bG6u1a9fqzjvvlNvt1jnnnKNp06bpoYceCvVUAQCApUIeYFatWnXS7QkJCVqyZImWLFnSYZ+cnBytW7cu1FMDAABRgu9CAgAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDgEGAABYhwADAACsE/YvcwR6ooFzKtpt725f8ggAtmIFBgAAWIcAAwAArEOAAQAA1iHAAAAA6xBgAACAdQgwAADAOgQYAABgHQIMAACwDjeyA7oQN7gDgNBgBQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDpcRg10A1xeDQCdQ4DpAh39cQIAAGeGQ0gAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHW4kR3QjbV3E0TuzgsArMAAAAALEWAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKzDfWAAy7R3bxiJ+8MA6FkIMECUaC/YuGKNFo+NwGQAIMw4hAQAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGA
AAYB0CDAAAsA73gQF6gOFlG+RtcQSec9M7ALYjwIRQR3dIBQAAocUhJAAAYB1WYIAeiO9TAmA7VmAAAIB1CDAAAMA6HEICEMChJQC2YAUGAABYhwADAACswyEkAKfU3qElDisBiCQCDIAzEqobNxKEAJwJDiEBAADrEGAAAIB1CDAAAMA6nANzhr797b4AAKDrsAIDAACswwoMgIjiEm0AZ4IAAyAqfPuwLiEIiG4EGADWaG+1xhVrtHhseMaWCEJAd9WtA8ySJUv06KOPqq6uTqNGjdLTTz+tsWND8EkFoFsLxU3yQnWjPQDdU7cNMC+88IJKS0u1dOlS5efn64knnlBRUZH27t2r1NTUSE8PQA/R1efonGyVicNk/8CKGbptgHnsscd0++2366c//akkaenSpaqoqNDzzz+vOXPmRHh2AHqyzq7utPdHtTutMoXrj34kQkZnfiaEHbt1ywDT3NysmpoazZ07N9AWExOjgoICVVdXt/sar9crr9cbeH706FFJ0uHDh+Xz+UI2N5/Pp6amJsX5YtTij877wMT5jZqa/FFdo9Qz6qTG7uHCX65u09aZD99w19je/EKhoxrbez9XjNEDo/26+P6X5D2NGkPxxytcdUvStrnj27S1/v043RrbG+Nk8surznqMs9Va4xdffKH4+PgzGuPYsWOSJGPMyTuabuizzz4zkszbb78d1D579mwzduzYdl8zf/58I4kHDx48ePDgEQWPTz/99KRZoVuuwJyJuXPnqrS0NPDc7/fr8OHD6t+/vxyO0P0/loaGBmVlZenTTz9VUlJSyMbtTnpCjVLPqJMaowM1RgdqPD3GGB07dkyZmZkn7dctA8yAAQMUGxur+vr6oPb6+nqlp6e3+xqXyyWXyxXUlpKSEq4pKikpKWp/AVv1hBqlnlEnNUYHaowO1HhqycnJp+zTLb9KwOl0Ki8vT1VV/zie5/f7VVVVJbfbHcGZAQCA7qBbrsBIUmlpqaZNm6YxY8Zo7NixeuKJJ3T8+PHAVUkAAKDn6rYB5sYbb9Tf/vY3zZs3T3V1dbr44ou1fv16paWlRXReLpdL8+fPb3O4Kpr0hBqlnlEnNUYHaowO1BhaDmNOdZ0SAABA99Itz4EBAAA4GQIMAACwDgEGAABYhwADAACsQ4DppCVLlmjgwIFKSEhQfn6+tm/fHukpnbEtW7bommuuUWZmphwOh9asWRO03RijefPmKSMjQ7169VJBQYH27dsXmcmeofLycn3/+99Xnz59lJqaquuuu0579+4N6nPixAmVlJSof//+6t27tyZPntzmJord2TPPPKORI0cGbhzldrv1pz/9KbDd9vras2jRIjkcDs2aNSvQZnudZWVlcjgcQY/BgwcHttteX6vPPvtMP/nJT9S/f3/16tVLI0aM0I4dOwLbbf/cGThwYJv96HA4VFJSIik69mNLS4sefPBB5ebmqlevXrrgggu0cOHCoO8u6pL9ePbfXNRzrFq1yjidTvP888+bPXv2mNtvv92kpKSY+vr6SE/tjKxbt87cf//95qWXXjKSzMsvvxy0fdGiRSY5OdmsWbPG/PnPfzY/+tGPTG5urvnyyy8jM+EzUFRUZJYtW2Z2795tamtrzdVXX22ys7NNY2NjoM8dd9xhsrKyTFVVldmxY4cZN26c+cEPfhDBWXfOK6+8YioqKswHH3xg9u7da+677z4THx9vdu/ebYyxv75v2759uxk4cKAZOXKkueuuuwLtttc5f/58M2zYMHPw4MHA429/+1tgu+31GWPM4cOHTU5Ojrn11lvNtm3bzF/+8hezYcMG8+GHHwb62P65c+jQoaB96PF4jCTz+uuvG2OiYz8+/PDDpn///mbt2rVm//795sUXXzS9e/c2Tz75ZKBPV+xHAkwnjB071pSUlASet7S0mMzMTFNeXh7BWYXGtwOM3+836enp5tFHHw20HTlyxLhcLvOHP/whAjMMjUOHDhlJZvPmzcaYr2uKj4
83L774YqDP+++/bySZ6urqSE3zrPXt29f87ne/i7r6jh07Zr773e8aj8djfvjDHwYCTDTUOX/+fDNq1Kh2t0VDfcYYc++995pLL720w+3R+Llz1113mQsuuMD4/f6o2Y/FxcXmtttuC2q74YYbzNSpU40xXbcfOYR0mpqbm1VTU6OCgoJAW0xMjAoKClRdXR3BmYXH/v37VVdXF1RvcnKy8vPzra736NGjkqR+/fpJkmpqauTz+YLqHDx4sLKzs62ss6WlRatWrdLx48fldrujrr6SkhIVFxcH1SNFz37ct2+fMjMzdf7552vq1Kk6cOCApOip75VXXtGYMWP0z//8z0pNTdXo0aP1H//xH4Ht0fa509zcrN///ve67bbb5HA4omY//uAHP1BVVZU++OADSdKf//xnvfnmm5o0aZKkrtuP3fZOvN3N//3f/6mlpaXNnYDT0tL0v//7vxGaVfjU1dVJUrv1tm6zjd/v16xZs3TJJZdo+PDhkr6u0+l0tvniT9vq3LVrl9xut06cOKHevXvr5Zdf1tChQ1VbWxsV9UnSqlWr9D//8z9655132myLhv2Yn5+v5cuXa9CgQTp48KAWLFigyy67TLt3746K+iTpL3/5i5555hmVlpbqvvvu0zvvvKN//dd/ldPp1LRp06Luc2fNmjU6cuSIbr31VknR8XsqSXPmzFFDQ4MGDx6s2NhYtbS06OGHH9bUqVMldd3fDwIMeoySkhLt3r1bb775ZqSnEnKDBg1SbW2tjh49qv/+7//WtGnTtHnz5khPK2Q+/fRT3XXXXfJ4PEpISIj0dMKi9f+9StLIkSOVn5+vnJwcrV69Wr169YrgzELH7/drzJgx+vWvfy1JGj16tHbv3q2lS5dq2rRpEZ5d6D333HOaNGmSMjMzIz2VkFq9erVWrFihlStXatiwYaqtrdWsWbOUmZnZpfuRQ0inacCAAYqNjW1ztnh9fb3S09MjNKvwaa0pWuqdOXOm1q5dq9dff13nnXdeoD09PV3Nzc06cuRIUH/b6nQ6nbrwwguVl5en8vJyjRo1Sk8++WTU1FdTU6NDhw7pe9/7nuLi4hQXF6fNmzfrqaeeUlxcnNLS0qKizm9KSUnRRRddpA8//DBq9mNGRoaGDh0a1DZkyJDAobJo+tz55JNP9Nprr+lnP/tZoC1a9uPs2bM1Z84cTZkyRSNGjNDNN9+su+++W+Xl5ZK6bj8SYE6T0+lUXl6eqqqqAm1+v19VVVVyu90RnFl45ObmKj09PajehoYGbdu2zap6jTGaOXOmXn75ZW3cuFG5ublB2/Py8hQfHx9U5969e3XgwAGr6vw2v98vr9cbNfWNHz9eu3btUm1tbeAxZswYTZ06NfDf0VDnNzU2Nuqjjz5SRkZG1OzHSy65pM1tDD744APl5ORIip7PHUlatmyZUlNTVVxcHGiLlv3Y1NSkmJjg+BAbGyu/3y+pC/djyE4H7gFWrVplXC6XWb58uXnvvffMjBkzTEpKiqmrq4v01M7IsWPHzLvvvmveffddI8k89thj5t133zWffPKJMebry+BSUlLMH//4R7Nz505z7bXXWnU5ozHG3HnnnSY5Odls2rQp6NLGpqamQJ877rjDZGdnm40bN5odO3YYt9tt3G53BGfdOXPmzDGbN282+/fvNzt37jRz5swxDofDVFZWGmPsr68j37wKyRj76/zFL35hNm3aZPbv32/eeustU1BQYAYMGGAOHTpkjLG/PmO+vgQ+Li7OPPzww2bfvn1mxYoVJjEx0fz+978P9ImGz52WlhaTnZ1t7r333jbbomE/Tps2zXznO98JXEb90ksvmQEDBph77rkn0Kcr9iMBppOefvppk52dbZxOpxk7dqzZunVrpKd0xl5//XUjqc1j2rRpxpivL4V78MEHTVpamnG5XGb8+PFm7969kZ10J7VXnySzbNmyQJ8vv/zS/Mu//Ivp27evSUxMNNdff705ePBg5CbdSbfddp
vJyckxTqfTnHvuuWb8+PGB8GKM/fV15NsBxvY6b7zxRpORkWGcTqf5zne+Y2688cag+6PYXl+rV1991QwfPty4XC4zePBg8+yzzwZtj4bPnQ0bNhhJ7c47GvZjQ0ODueuuu0x2drZJSEgw559/vrn//vuN1+sN9OmK/egw5hu3zgMAALAA58AAAADrEGAAAIB1CDAAAMA6BBgAAGAdAgwAALAOAQYAAFiHAAMAAKxDgAEAANYhwAAAAOsQYAAAgHUIMAAAwDoEGAAAYJ3/B5AA+1FRFLp8AAAAAElFTkSuQmCC", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "speeds.speed_mph.hist(bins=range(0, 80, 1))" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "id": "ef42d97c-3fa4-4251-b448-c69dc97e9918", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 67701.000000\n", + "mean 6.209578\n", + "std 3.949743\n", + "min 0.050000\n", + "25% 3.700000\n", + "50% 5.610000\n", + "75% 7.790000\n", + "max 65.790000\n", + "Name: p20_mph, dtype: float64" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segment_speeds.p20_mph.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "id": "709faeac-5687-4951-af33-d3af4a1e0e08", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 67701.000000\n", + "mean 6.294138\n", + "std 3.925301\n", + "min 0.070000\n", + "25% 3.760000\n", + "50% 5.690000\n", + "75% 7.850000\n", + "max 65.790000\n", + "Name: p50_mph, dtype: float64" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segment_speeds.p50_mph.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "id": "afb4134a-92c9-402e-b26d-192dd7a7e72a", + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 67701.000000\n", + "mean 6.378770\n", + "std 3.903607\n", + "min 0.090000\n", + "25% 3.910000\n", + "50% 5.760000\n", + "75% 7.900000\n", + "max 65.790000\n", + "Name: p80_mph, dtype: float64" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "segment_speeds.p80_mph.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "id": "38ff2c1b-9014-4242-b597-faa22997d30f", + "metadata": {}, + "outputs": [], + "source": [ + "def filter_specific_stops(\n", + " speeds, \n", + " stop_pair: str\n", + "):\n", + " \"\"\"\n", + " Pull out a specific stop (using 
stop_pair)\n", + " and show the distribution of speeds.\n", + " First table shows time-of-day distribution (AM Peak)\n", + " and second table shows how certain time-of-day aggregations\n", + " get rolled into peak vs offpeak.\n", + " \"\"\"\n", + " \n", + " speeds_subset = speeds[\n", + " speeds.stop_pair == stop_pair].sort_values(\"speed_mph\")\n", + " \n", + " list_of_speeds = speeds_subset.groupby(\n", + " [\"time_of_day\", \"peak_offpeak\", \n", + " \"stop_pair\",\n", + " \"stop_sequence\", \"stop_sequence1\"]).agg(\n", + " {\"speed_mph\": lambda x: list(round(x, 2))}\n", + " ).reset_index()\n", + "\n", + "\n", + " list_of_speeds2 = speeds_subset.groupby(\n", + " [\"peak_offpeak\", \n", + " \"stop_pair\",\n", + " \"stop_sequence\", \"stop_sequence1\"]).agg(\n", + " {\"speed_mph\": lambda x: list(round(x, 2))}\n", + " ).reset_index()\n", + "\n", + " with pd.option_context(\n", + " 'display.max_rows', None, \n", + " 'display.max_columns', None,\n", + " # bump the last column so we can print out all the values in the\n", + " # distribution of speeds\n", + " 'display.max_colwidth', None\n", + " ): # more options can be specified also\n", + " display(list_of_speeds) \n", + " display(list_of_speeds2)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "id": "92df9bc8-c502-4a29-8dc3-e49282036142", + "metadata": {}, + "outputs": [], + "source": [ + "# Find some stops that show up as green on the 20th percentile \n", + "# either AM Peak, Midday, or PM Peak\n", + "find_me = {\n", + " \"Olympic & Veteran to Olympic and Westwood\": \"688__59\",\n", + " \"some_other_stop\": \"1530__1531\",\n", + " \"Barrington & La Grange to Barrington & Olympic (Sawtelle)\":\"1392__1393\"\n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "id": "72c0d264-b116-440e-a3d1-232eb763ef50", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
time_of_daypeak_offpeakstop_pairstop_sequencestop_sequence1speed_mph
0AM Peakpeak688__592626.0[16.85, 25.57]
1Middayoffpeak688__592626.0[13.01, 13.99, 17.24, 24.71]
2PM Peakpeak688__592626.0[7.49, 10.3, 14.83, 26.48]
\n", + "
" + ], + "text/plain": [ + " time_of_day peak_offpeak stop_pair stop_sequence stop_sequence1 \\\n", + "0 AM Peak peak 688__59 26 26.0 \n", + "1 Midday offpeak 688__59 26 26.0 \n", + "2 PM Peak peak 688__59 26 26.0 \n", + "\n", + " speed_mph \n", + "0 [16.85, 25.57] \n", + "1 [13.01, 13.99, 17.24, 24.71] \n", + "2 [7.49, 10.3, 14.83, 26.48] " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
peak_offpeakstop_pairstop_sequencestop_sequence1speed_mph
0offpeak688__592626.0[13.01, 13.99, 17.24, 24.71]
1peak688__592626.0[7.49, 10.3, 14.83, 16.85, 25.57, 26.48]
\n", + "
" + ], + "text/plain": [ + " peak_offpeak stop_pair stop_sequence stop_sequence1 \\\n", + "0 offpeak 688__59 26 26.0 \n", + "1 peak 688__59 26 26.0 \n", + "\n", + " speed_mph \n", + "0 [13.01, 13.99, 17.24, 24.71] \n", + "1 [7.49, 10.3, 14.83, 16.85, 25.57, 26.48] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "filter_specific_stops(speeds, \"688__59\")" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "id": "31bb3991-aa77-4729-b400-5cebf14bd884", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
time_of_daypeak_offpeakstop_pairstop_sequencestop_sequence1speed_mph
0AM Peakpeak1530__15312020.0[4.95, 8.37, 15.87, 17.72, 18.14, 22.41, 25.39]
1Early AMoffpeak1530__15312020.0[15.87, 18.58]
2Eveningoffpeak1530__15312020.0[6.0, 18.58]
3Middayoffpeak1530__15312020.0[10.73, 11.54, 13.6, 13.6, 14.11, 14.65, 14.65, 15.55, 21.16, 24.57, 26.27, 29.3, 29.3, 58.6]
4PM Peakpeak1530__15312020.0[9.4, 9.64, 10.16, 10.29, 13.6, 13.85, 15.24, 15.87, 16.21, 16.93, 20.59, 21.77]
\n", + "
" + ], + "text/plain": [ + " time_of_day peak_offpeak stop_pair stop_sequence stop_sequence1 \\\n", + "0 AM Peak peak 1530__1531 20 20.0 \n", + "1 Early AM offpeak 1530__1531 20 20.0 \n", + "2 Evening offpeak 1530__1531 20 20.0 \n", + "3 Midday offpeak 1530__1531 20 20.0 \n", + "4 PM Peak peak 1530__1531 20 20.0 \n", + "\n", + " speed_mph \n", + "0 [4.95, 8.37, 15.87, 17.72, 18.14, 22.41, 25.39] \n", + "1 [15.87, 18.58] \n", + "2 [6.0, 18.58] \n", + "3 [10.73, 11.54, 13.6, 13.6, 14.11, 14.65, 14.65, 15.55, 21.16, 24.57, 26.27, 29.3, 29.3, 58.6] \n", + "4 [9.4, 9.64, 10.16, 10.29, 13.6, 13.85, 15.24, 15.87, 16.21, 16.93, 20.59, 21.77] " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
peak_offpeakstop_pairstop_sequencestop_sequence1speed_mph
0offpeak1530__15312020.0[6.0, 10.73, 11.54, 13.6, 13.6, 14.11, 14.65, 14.65, 15.55, 15.87, 18.58, 18.58, 21.16, 24.57, 26.27, 29.3, 29.3, 58.6]
1peak1530__15312020.0[4.95, 8.37, 9.4, 9.64, 10.16, 10.29, 13.6, 13.85, 15.24, 15.87, 15.87, 16.21, 16.93, 17.72, 18.14, 20.59, 21.77, 22.41, 25.39]
\n", + "
" + ], + "text/plain": [ + " peak_offpeak stop_pair stop_sequence stop_sequence1 \\\n", + "0 offpeak 1530__1531 20 20.0 \n", + "1 peak 1530__1531 20 20.0 \n", + "\n", + " speed_mph \n", + "0 [6.0, 10.73, 11.54, 13.6, 13.6, 14.11, 14.65, 14.65, 15.55, 15.87, 18.58, 18.58, 21.16, 24.57, 26.27, 29.3, 29.3, 58.6] \n", + "1 [4.95, 8.37, 9.4, 9.64, 10.16, 10.29, 13.6, 13.85, 15.24, 15.87, 15.87, 16.21, 16.93, 17.72, 18.14, 20.59, 21.77, 22.41, 25.39] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "filter_specific_stops(speeds, \"1530__1531\")" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "id": "8b1c2a8c-0db9-4be8-ada2-ecc8d423a26c", + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
time_of_daypeak_offpeakstop_pairstop_sequencestop_sequence1speed_mph
0AM Peakpeak1392__13931212.0[2.69, 18.21, 29.68, inf]
1Middayoffpeak1392__13931212.0[6.21, 13.13, 15.41, 19.08, 19.54, 22.89]
2PM Peakpeak1392__13931212.0[2.88, 8.09, 12.52]
\n", + "
" + ], + "text/plain": [ + " time_of_day peak_offpeak stop_pair stop_sequence stop_sequence1 \\\n", + "0 AM Peak peak 1392__1393 12 12.0 \n", + "1 Midday offpeak 1392__1393 12 12.0 \n", + "2 PM Peak peak 1392__1393 12 12.0 \n", + "\n", + " speed_mph \n", + "0 [2.69, 18.21, 29.68, inf] \n", + "1 [6.21, 13.13, 15.41, 19.08, 19.54, 22.89] \n", + "2 [2.88, 8.09, 12.52] " + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
peak_offpeakstop_pairstop_sequencestop_sequence1speed_mph
0offpeak1392__13931212.0[6.21, 13.13, 15.41, 19.08, 19.54, 22.89]
1peak1392__13931212.0[2.69, 2.88, 8.09, 12.52, 18.21, 29.68, inf]
\n", + "
" + ], + "text/plain": [ + " peak_offpeak stop_pair stop_sequence stop_sequence1 \\\n", + "0 offpeak 1392__1393 12 12.0 \n", + "1 peak 1392__1393 12 12.0 \n", + "\n", + " speed_mph \n", + "0 [6.21, 13.13, 15.41, 19.08, 19.54, 22.89] \n", + "1 [2.69, 2.88, 8.09, 12.52, 18.21, 29.68, inf] " + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "filter_specific_stops(speeds, \"1392__1393\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9ce865d-1003-462c-ba57-bda7e4b30190", + "metadata": {}, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/rt_segment_speeds/quick_bbb_speeds.py b/rt_segment_speeds/quick_bbb_speeds.py new file mode 100644 index 000000000..c69a26086 --- /dev/null +++ b/rt_segment_speeds/quick_bbb_speeds.py @@ -0,0 +1,165 @@ +""" +Quick script to put together Big Blue Bus speedmap +segment speeds. + +Use this for exploratory work on where 20th, 80th +percentile speeds seem low...be able to pull out +the full distribution of speeds going into that. + +Aligned earlier steps to factor in dwell time, +only interpolate between the "before" and "after" stop +position point. + +Adapt this later into `stop_arrivals_to_speeds` and +`average_segment_speeds`. 
+""" +import geopandas as gpd +import pandas as pd + +from segment_speed_utils import (gtfs_schedule_wrangling, + helpers, segment_calcs) + +from segment_speed_utils.project_vars import SEGMENT_GCS, GTFS_DATA_DICT +from shared_utils import rt_dates, rt_utils + +analysis_date = rt_dates.DATES["apr2024"] + +def calculate_speeds(df: pd.DataFrame, trip_stop_cols: list): + # Truncate this function for now - this sits in `stop_arrivals_to_speeds` + # set our own trip_stop_cols for speedmap segments + trip_cols = ["trip_instance_key"] + + + df = segment_calcs.convert_timestamp_to_seconds( + df, ["arrival_time"] + ).sort_values(trip_stop_cols).reset_index(drop=True) + + df = df.assign( + subseq_arrival_time_sec = (df.groupby(trip_cols, + observed=True, group_keys=False) + .arrival_time_sec + .shift(-1) + ), + subseq_stop_meters = (df.groupby(trip_cols, + observed=True, group_keys=False) + .stop_meters + .shift(-1) + ) + ) + + speed = df.assign( + meters_elapsed = df.subseq_stop_meters - df.stop_meters, + sec_elapsed = df.subseq_arrival_time_sec - df.arrival_time_sec, + ).pipe( + segment_calcs.derive_speed, + ("stop_meters", "subseq_stop_meters"), + ("arrival_time_sec", "subseq_arrival_time_sec") + ) + + return speed + + +def bbb_speeds_by_trip(analysis_date: str): + bbb_arrivals = pd.read_parquet( + f"{SEGMENT_GCS}test_arrivals_{analysis_date}.parquet" + ) + + # double check this and make sure list is partially in GTFS_DATA_DICT.trip_stop_cols + trip_stop_cols = [ + "trip_instance_key", "stop_sequence", + "stop_sequence1", "stop_pair" + ] + ["shape_array_key"] + + + # Needed to reconcile speedmap segments stop_sequenc1 (which can be missing + # and fill it in with stop_sequence + # otherwise merges downstream will drop too many rows + speeds = calculate_speeds( + bbb_arrivals, + trip_stop_cols + ["stop_meters"] + ).pipe( + gtfs_schedule_wrangling.fill_missing_stop_sequence1 + ) + + time_of_day = ( + gtfs_schedule_wrangling.get_trip_time_buckets(analysis_date) + 
[["trip_instance_key", "time_of_day"]] + ) + + speeds2 = pd.merge( + speeds.assign(service_date = pd.to_datetime(analysis_date)), + time_of_day, + on = "trip_instance_key", + how = "inner" + ).pipe( + gtfs_schedule_wrangling.add_peak_offpeak_column + ).pipe( + gtfs_schedule_wrangling.add_weekday_weekend_column + ) + + return speeds2 + + +def average_bbb_segment_speeds_with_geom( + analysis_date: str, + speeds: pd.DataFrame +) -> gpd.GeoDataFrame: + + # double check this and make sure list is partially in GTFS_DATA_DICT.trip_stop_cols + # use same list as above + trip_stop_cols = [ + "trip_instance_key", "stop_sequence", + "stop_sequence1", "stop_pair" + ] + ["shape_array_key"] + + + SPEEDMAP_SEGMENTS = GTFS_DATA_DICT.speedmap_segments.segments_file + + segment_geom = gpd.read_parquet( + f"{SEGMENT_GCS}{SPEEDMAP_SEGMENTS}_{analysis_date}.parquet" + ) + + speeds_with_geom = pd.merge( + segment_geom, + speeds[speeds.speed_mph <= 80], + # this filtering would be present in averaging already + on = trip_stop_cols + ) + + avg_speeds = segment_calcs.calculate_avg_speeds( + speeds_with_geom, + ["route_id", "direction_id", "stop_pair", "segment_id", "peak_offpeak"] + ) + + segment_gdf = pd.merge( + segment_geom[["route_id", "direction_id", + "stop_id1", "stop_sequence", "stop_id2", "stop_sequence1", + "shape_array_key", + "segment_id", + "stop_pair"] + ["geometry"]], + avg_speeds, + on = ["route_id", "direction_id", + "segment_id", + "stop_pair"] + ) + + segment_gdf = segment_gdf.assign( + geometry = segment_gdf.apply( + lambda x: rt_utils.try_parallel(x.geometry), + axis=1) + ) + + return segment_gdf + +if __name__ == "__main__": + + bbb_speeds = bbb_speeds_by_trip(analysis_date) + bbb_segment_speeds = average_bbb_segment_speeds_with_geom(analysis_date, bbb_speeds) + + bbb_speeds.to_parquet( + f"{SEGMENT_GCS}bbb_speeds_by_trip_{analysis_date}.parquet" + ) + + bbb_segment_speeds.to_parquet( + f"{SEGMENT_GCS}bbb_segment_speeds_gdf_{analysis_date}.parquet" + ) diff --git 
a/rt_segment_speeds/scripts/new_interpolate.py b/rt_segment_speeds/scripts/new_interpolate.py new file mode 100644 index 000000000..271cdaf47 --- /dev/null +++ b/rt_segment_speeds/scripts/new_interpolate.py @@ -0,0 +1,70 @@ +import datetime +import geopandas as gpd +import numpy as np +import pandas as pd +import shapely + +from segment_speed_utils import helpers, wrangle_shapes +from segment_speed_utils.project_vars import SEGMENT_GCS, GTFS_DATA_DICT, PROJECT_CRS +from shared_utils import rt_dates +import interpolate_stop_arrival + +analysis_date = rt_dates.DATES["apr2024"] + +def add_arrival_time(analysis_date: str): + gdf = gpd.read_parquet( + f"{SEGMENT_GCS}test_nearest2_vp_to_stop_{analysis_date}.parquet" + ) + + arrival_time_series = [] + + for row in gdf.itertuples(): + + stop_position = getattr(row, "stop_meters") + + projected_points = np.asarray([ + getattr(row, "prior_shape_meters"), + getattr(row, "subseq_shape_meters") + ]) + + timestamp_arr = np.asarray([ + getattr(row, "prior_vp_timestamp_local"), + getattr(row, "subseq_vp_timestamp_local"), + ]) + + + interpolated_arrival = wrangle_shapes.interpolate_stop_arrival_time( + stop_position, projected_points, timestamp_arr) + + arrival_time_series.append(interpolated_arrival) + + gdf["arrival_time"] = arrival_time_series + + drop_cols = [i for i in gdf.columns if + ("prior_" in i) or ("subseq_" in i)] + + gdf2 = gdf.drop(columns = drop_cols + ["stop_geometry"]) + + return gdf2 + +if __name__ == "__main__": + + start = datetime.datetime.now() + + trip_stop_cols = [ + "trip_instance_key", "stop_sequence", + "stop_sequence1"] + ["shape_array_key", "stop_pair", "stop_meters"] + #trip_stop_cols = [*dict_inputs["trip_stop_cols"]] + + results = add_arrival_time(analysis_date) + + results = interpolate_stop_arrival.enforce_monotonicity_and_interpolate_across_stops( + results, trip_stop_cols) + + + results.to_parquet( + f"{SEGMENT_GCS}test_arrivals_{analysis_date}.parquet" + ) + + end = datetime.datetime.now() + 
print(f"test arrivals (BBB): {end - start}") diff --git a/rt_segment_speeds/scripts/new_narrow_to_2.py b/rt_segment_speeds/scripts/new_narrow_to_2.py new file mode 100644 index 000000000..29acb96f7 --- /dev/null +++ b/rt_segment_speeds/scripts/new_narrow_to_2.py @@ -0,0 +1,197 @@ +import datetime +import geopandas as gpd +import numpy as np +import pandas as pd +import shapely + +from calitp_data_analysis import utils +from calitp_data_analysis.geography_utils import WGS84 +from segment_speed_utils import helpers, wrangle_shapes +from segment_speed_utils.project_vars import SEGMENT_GCS, GTFS_DATA_DICT, PROJECT_CRS +from shared_utils import rt_dates + +analysis_date = rt_dates.DATES["apr2024"] + +def merge_nearest_vp_with_shape(analysis_date: str): + vp = gpd.read_parquet( + f"{SEGMENT_GCS}nearest/" + f"test_nearest_vp_to_stop_{analysis_date}.parquet", + ) + + shapes = helpers.import_scheduled_shapes( + analysis_date, + columns = ["shape_array_key", "geometry"], + filters = [[("shape_array_key", "in", vp.shape_array_key.unique())]], + crs = PROJECT_CRS, + get_pandas = True + ) + + vp_with_shape = pd.merge( + vp, + shapes.rename(columns = {"geometry": "shape_geometry"}), + on = "shape_array_key", + how = "inner" + ) + + return vp_with_shape + + +def explode_vp_and_project_onto_shape( + vp_with_shape: gpd.GeoDataFrame, + analysis_date: str +) -> gpd.GeoDataFrame: + vp_long = vp_with_shape.explode( + "nearest_vp_arr" + ).reset_index(drop=True).rename( + columns = {"nearest_vp_arr": "vp_idx"} + ) + + subset_vp = vp_long.vp_idx.unique().tolist() + + vp_with_dwell = pd.read_parquet( + f"{SEGMENT_GCS}vp_usable_with_dwell_{analysis_date}", + filters = [[("vp_idx", "in", subset_vp)]], + columns = ["vp_idx", "x", "y", "location_timestamp_local", + "moving_timestamp_local"] + ).pipe(wrangle_shapes.vp_as_gdf, crs = PROJECT_CRS) + + gdf = pd.merge( + vp_long, + vp_with_dwell.rename(columns = {"geometry": "vp_geometry"}), + on = "vp_idx", + how = "inner" + ) + + gdf = gdf.assign( 
+ stop_meters = gdf.shape_geometry.project(gdf.stop_geometry), + shape_meters = gdf.shape_geometry.project(gdf.vp_geometry) + ) + + gdf = gdf.assign( + stop_vp_distance_meters = (gdf.stop_meters - + gdf.shape_meters).round(2) + ) + + return gdf + + +def find_two_closest_vp(gdf: gpd.GeoDataFrame, group_cols: list): + + positive_distances_df = gdf.loc[gdf.stop_vp_distance_meters >= 0] + negative_distances_df = gdf.loc[gdf.stop_vp_distance_meters < 0] + + #https://github.com/pandas-dev/pandas/issues/45089 + # add dropna=False or else too many combos are lost + min_pos_distance = ( + positive_distances_df + .groupby(group_cols, + observed=True, group_keys=False, dropna=False) + .agg({"stop_vp_distance_meters": "min"}) + .reset_index() + ) + + min_neg_distance = ( + negative_distances_df + .groupby(group_cols, + observed=True, group_keys=False, dropna=False) + .agg({"stop_vp_distance_meters": "max"}) + .reset_index() + ) + + two_vp = pd.concat( + [min_pos_distance, min_neg_distance], + axis=0, ignore_index=True + ) + + gdf2 = pd.merge( + gdf, + two_vp, + on = group_cols + ["stop_vp_distance_meters"], + how = "inner" + ) + + # since shape_meters actually might be decreasing as time progresses, + # (bus moving back towards origin of shape) + # we don't actually know that the smaller shape_meters is the first timestamp + # nor the larger shape_meters is the second timestamp. + # all we know is that stop_meters (stop) falls between these 2 shape_meters. 
+ # sort by timestamp, and set the order to be 0, 1 + + return gdf2 + + +def consolidate_surrounding_vp(df, group_cols): + df = df.assign( + obs = (df.sort_values(group_cols + ["vp_idx"]) + .groupby(group_cols, + observed=True, group_keys=False, dropna=False) + .cumcount() + ) + ) + + if "stop_meters" not in group_cols: + group_cols = group_cols + ["stop_meters"] + + group_cols2 = group_cols + ["stop_geometry"] + prefix_cols = ["vp_idx", "shape_meters"] + timestamp_cols = ["location_timestamp_local", "moving_timestamp_local"] + + + vp_before_stop = df.loc[df.obs==0][group_cols2 + prefix_cols + timestamp_cols] + vp_after_stop = df.loc[df.obs==1][group_cols2 + prefix_cols + timestamp_cols] + + # For the vp before the stop occurs, we want the maximum timestamp + # of the last position + # We want to keep the moving_timestamp (which is after it's dwelled) + vp_before_stop = vp_before_stop.assign( + prior_vp_timestamp_local = vp_before_stop.moving_timestamp_local, + ).rename( + columns = {**{i: f"prior_{i}" for i in prefix_cols}} + ).drop(columns = timestamp_cols) + + # For the vp after the stop occurs, we want the minimum timestamp + # of that next position + # Keep location_timetamp (before it dwells) + vp_after_stop = vp_after_stop.assign( + subseq_vp_timestamp_local = vp_after_stop.location_timestamp_local, + ).rename( + columns = {**{i: f"subseq_{i}" for i in prefix_cols}} + ).drop(columns = timestamp_cols) + + df_wide = pd.merge( + vp_before_stop, + vp_after_stop, + on = group_cols2, + how = "inner" + ) + + return df_wide + + +if __name__ == "__main__": + + start = datetime.datetime.now() + + gdf = merge_nearest_vp_with_shape(analysis_date) + + gdf2 = explode_vp_and_project_onto_shape(gdf, analysis_date) + + # These are defined in GTFS_DATA_DICT + trip_stop_cols = [ + "trip_instance_key", "stop_sequence", + "stop_sequence1"] + ["shape_array_key", "stop_pair", "stop_meters"] + + gdf3 = find_two_closest_vp(gdf2, trip_stop_cols).sort_values( + trip_stop_cols + 
["vp_idx"] + ).reset_index(drop=True) + + gdf4 = consolidate_surrounding_vp(gdf3, trip_stop_cols) + + utils.geoparquet_gcs_export( + gdf4, + SEGMENT_GCS, + f"test_nearest2_vp_to_stop_{analysis_date}" + ) + + end = datetime.datetime.now() + print(f"narrow down to 2 nearest vp (BBB): {end - start}") \ No newline at end of file diff --git a/rt_segment_speeds/scripts/new_nearest_10.py b/rt_segment_speeds/scripts/new_nearest_10.py new file mode 100644 index 000000000..90a61806c --- /dev/null +++ b/rt_segment_speeds/scripts/new_nearest_10.py @@ -0,0 +1,164 @@ +import datetime +import geopandas as gpd +import numpy as np +import pandas as pd +import shapely + +from scipy.spatial import KDTree + +from calitp_data_analysis import utils +from calitp_data_analysis.geography_utils import WGS84 +from segment_speed_utils import helpers#, neighbor +from segment_speed_utils.project_vars import SEGMENT_GCS, GTFS_DATA_DICT, PROJECT_CRS +from shared_utils import rt_dates + +analysis_date = rt_dates.DATES["apr2024"] + +import nearest_vp_to_stop + +def get_subset_trips(analysis_date: str) -> list: + bbb_trips = helpers.import_scheduled_trips( + analysis_date, + filters = [("name", "==", "Big Blue Bus Schedule")], + columns = ["gtfs_dataset_key", "trip_instance_key"], + get_pandas = True + ) + + bbb_key = bbb_trips.schedule_gtfs_dataset_key.iloc[0] + subset_trips = bbb_trips.trip_instance_key.unique() + + return subset_trips + +def construct_stop_times( + analysis_date: str, + subset_trips: list +) -> gpd.GeoDataFrame: + + # Grab the relevant stop times rows + # will need to concatenate RT stop times (all trips) + # with additional segments for speedmaps + rt_stop_times = ( + nearest_vp_to_stop.stop_times_for_all_trips(analysis_date) + .query('trip_instance_key in @subset_trips') + ) + + proxy_stop_times = ( + nearest_vp_to_stop.stop_times_for_speedmaps(analysis_date) + .query('trip_instance_key in @subset_trips') + ) + + bbb_stop_times = pd.concat( + [rt_stop_times, proxy_stop_times], 
+ axis=0, ignore_index=True + ) + + return bbb_stop_times + +def merge_stop_vp_for_nearest_neighbor( + stop_times: gpd.GeoDataFrame, + analysis_date: str, + **kwargs +) -> gpd.GeoDataFrame: + + vp_condensed = gpd.read_parquet( + f"{SEGMENT_GCS}condensed/" + f"vp_nearest_neighbor_dwell_{analysis_date}.parquet", + **kwargs + ).to_crs(PROJECT_CRS) + + gdf = pd.merge( + stop_times.rename( + columns = { + "geometry": "stop_geometry"} + ).set_geometry("stop_geometry").to_crs(PROJECT_CRS), + vp_condensed.rename( + columns = { + "vp_primary_direction": "stop_primary_direction", + "geometry": "vp_geometry" + }), + on = ["trip_instance_key", "stop_primary_direction"], + how = "inner" + ) + + return gdf + +def nearest_snap( + line: shapely.LineString, + point: shapely.Point, + k_neighbors: int +) -> int: + """ + Based off of this function, + but we want to return the index value, rather than the point. + https://github.com/UTEL-UIUC/gtfs_segments/blob/main/gtfs_segments/geom_utils.py + """ + line = np.asarray(line.coords) + point = np.asarray(point.coords) + tree = KDTree(line) + + # np_dist is array of distances of result + # np_inds is array of indices of result + np_dist, np_inds = tree.query( + point, workers=-1, k=k_neighbors, + ) + + return np_dist.squeeze(), np_inds.squeeze() + + +if __name__ == "__main__": + + start = datetime.datetime.now() + + subset_trips = get_subset_trips(analysis_date) + + bbb_stop_times = construct_stop_times(analysis_date, subset_trips) + + # This is with opposite direction removed + gdf = merge_stop_vp_for_nearest_neighbor( + bbb_stop_times, + analysis_date, + filters = [[("trip_instance_key", "in", subset_trips)]], + # just keep columns for merge + vp_idx and vp_geometry + columns = [ + "trip_instance_key", "vp_idx", + "vp_primary_direction", "geometry" + ], + ) + + N_NEAREST_POINTS = 10 + + nearest_vp_arr_series = [] + + for row in gdf.itertuples(): + vp_coords_line = getattr(row, "vp_geometry") + stop_geometry = getattr(row, 
"stop_geometry") + vp_idx_arr = getattr(row, "vp_idx") + + _, np_inds = nearest_snap( + vp_coords_line, stop_geometry, N_NEAREST_POINTS + ) + + # nearest neighbor returns self.N + # if there are no nearest neighbor results found + # if we want 10 nearest neighbors and 8th, 9th, 10th are all + # the same result, the 8th will have a result, then 9th and 10th will + # return the length of the array (which is out-of-bounds) + + np_inds2 = np_inds[np_inds < vp_idx_arr.size] + + nearest_vp_arr = vp_idx_arr[np_inds2] + + nearest_vp_arr_series.append(nearest_vp_arr) + + gdf2 = gdf.assign( + nearest_vp_arr = nearest_vp_arr_series + ).drop(columns = ["vp_idx", "vp_geometry"]) + + utils.geoparquet_gcs_export( + gdf2, + SEGMENT_GCS, + f"nearest/test_nearest_vp_to_stop_{analysis_date}" + ) + + end = datetime.datetime.now() + print(f"save nearest 10 (BBB): {end - start}") \ No newline at end of file diff --git a/rt_segment_speeds/segment_speed_utils/vp_transform.py b/rt_segment_speeds/segment_speed_utils/vp_transform.py index 48d226021..32a157a63 100644 --- a/rt_segment_speeds/segment_speed_utils/vp_transform.py +++ b/rt_segment_speeds/segment_speed_utils/vp_transform.py @@ -72,6 +72,7 @@ def combine_valid_vp_for_direction( coords_series = [] vp_idx_series = [] timestamp_series = [] + moving_timestamp_series = [] for row in vp_condensed.itertuples(): vp_dir_arr = np.asarray(getattr(row, "vp_primary_direction")) @@ -87,6 +88,10 @@ def combine_valid_vp_for_direction( timestamp_arr = np.asarray( getattr(row, "location_timestamp_local")) + moving_timestamp_arr = np.asarray( + getattr(row, "moving_timestamp_local") + ) + vp_linestring = coords_arr[valid_indices] if len(vp_linestring) > 1: @@ -100,6 +105,7 @@ def combine_valid_vp_for_direction( coords_series.append(valid_vp_line) vp_idx_series.append(vp_idx_arr[valid_indices]) timestamp_series.append(timestamp_arr[valid_indices]) + moving_timestamp_series.append(moving_timestamp_arr[valid_indices]) vp_condensed = vp_condensed.assign( 
@@ -107,8 +113,12 @@ def combine_valid_vp_for_direction( geometry = coords_series, vp_idx = vp_idx_series, location_timestamp_local = timestamp_series, + moving_timestamp_local = moving_timestamp_series, )[["trip_instance_key", "vp_primary_direction", - "geometry", "vp_idx", "location_timestamp_local"]].reset_index(drop=True) + "geometry", "vp_idx", + "location_timestamp_local", + "moving_timestamp_local" + ]].reset_index(drop=True) gdf = gpd.GeoDataFrame( vp_condensed,