From 8c0dd855624e44f6e2cbfa301386065d132c34b5 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Wed, 26 Jul 2023 17:50:04 +0000 Subject: [PATCH] run may-jul through pipeline, remove calculate-trip-avgs notebook --- rt_segment_speeds/logs/cut_stop_segments.log | 8 + rt_segment_speeds/logs/prep_stop_segments.log | 1 + rt_segment_speeds/logs/sjoin_vp_segments.log | 12 + .../logs/speeds_by_segment_trip.log | 15 + .../logs/valid_vehicle_positions.log | 27 + .../scripts/calculate-trip-avg.ipynb | 917 ------------------ 6 files changed, 63 insertions(+), 917 deletions(-) delete mode 100644 rt_segment_speeds/scripts/calculate-trip-avg.ipynb diff --git a/rt_segment_speeds/logs/cut_stop_segments.log b/rt_segment_speeds/logs/cut_stop_segments.log index df7e8ea74..d98de510f 100644 --- a/rt_segment_speeds/logs/cut_stop_segments.log +++ b/rt_segment_speeds/logs/cut_stop_segments.log @@ -37,3 +37,11 @@ 2023-07-25 15:31:25.425 | INFO | __main__::313 - Cut special stop segments: 0:28:46.570691 2023-07-25 15:31:26.518 | INFO | __main__::332 - export results: 0:00:01.093220 2023-07-25 15:31:26.520 | INFO | __main__::333 - execution time: 0:28:47.663911 +2023-07-25 16:07:13.609 | INFO | __main__::198 - Analysis date: 2023-07-12 +2023-07-25 16:08:07.369 | INFO | __main__::240 - Cut normal stop segments: 0:00:53.759088 +2023-07-25 16:19:44.770 | INFO | __main__::252 - Export results: 0:11:37.401262 +2023-07-25 16:19:44.770 | INFO | __main__::255 - execution time: 0:12:31.160955 +2023-07-25 16:20:26.164 | INFO | __main__::293 - Analysis date: 2023-07-12 +2023-07-25 16:48:16.067 | INFO | __main__::313 - Cut special stop segments: 0:27:49.863811 +2023-07-25 16:48:17.367 | INFO | __main__::332 - export results: 0:00:01.299823 +2023-07-25 16:48:17.369 | INFO | __main__::333 - execution time: 0:27:51.163634 diff --git a/rt_segment_speeds/logs/prep_stop_segments.log b/rt_segment_speeds/logs/prep_stop_segments.log index 45c4d8cee..4014ced4c 100644 --- a/rt_segment_speeds/logs/prep_stop_segments.log +++ b/rt_segment_speeds/logs/prep_stop_segments.log @@ -18,3 +18,4 @@ 2023-07-25 14:39:19.717 | INFO | __main__::290 - Analysis date: 2023-06-14 2023-07-25 14:49:25.861 | INFO | __main__::297 - Prep stop segment df: 0:10:06.141854 2023-07-25 14:50:12.947 | INFO | __main__::307 - execution time: 0:10:53.228185 +2023-07-25 15:55:48.582 | INFO | __main__::290 - Analysis date: 2023-07-12 diff --git a/rt_segment_speeds/logs/sjoin_vp_segments.log b/rt_segment_speeds/logs/sjoin_vp_segments.log index 16eb2d1ac..932e622d6 100644 --- a/rt_segment_speeds/logs/sjoin_vp_segments.log +++ b/rt_segment_speeds/logs/sjoin_vp_segments.log @@ -91,3 +91,15 @@ 2023-07-25 10:47:55.030 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:44:52.057560 2023-07-25 10:48:51.630 | INFO | __main__::308 - compiled parquets: 0:00:56.599756 2023-07-25 10:48:51.630 | INFO | __main__::309 - execution time: 0:45:48.657316 +2023-07-25 16:52:12.138 | INFO | __main__::286 - Analysis date: 2023-05-17 +2023-07-25 17:34:09.962 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:41:57.812287 +2023-07-25 17:35:06.440 | INFO | __main__::308 - compiled parquets: 0:00:56.477889 +2023-07-25 17:35:06.441 | INFO | __main__::309 - execution time: 0:42:54.290176 +2023-07-25 19:29:22.771 | INFO | __main__::286 - Analysis date: 2023-06-14 +2023-07-25 20:10:03.035 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:40:40.263436 +2023-07-25 20:11:00.673 | INFO | __main__::308 - compiled parquets: 0:00:57.637347 +2023-07-25 20:11:00.675 | INFO | __main__::309 - execution time: 0:41:37.900783 +2023-07-25 21:27:52.866 | INFO | __main__::286 - Analysis date: 2023-07-12 +2023-07-25 22:11:55.296 | INFO | __main__::298 - attach vp to stop-to-stop segments: 0:44:02.429077 +2023-07-25 22:13:11.709 | INFO | __main__::308 - compiled parquets: 0:01:16.412710 +2023-07-25 22:13:11.713 | INFO | __main__::309 - execution time: 0:45:18.841787 diff --git a/rt_segment_speeds/logs/speeds_by_segment_trip.log b/rt_segment_speeds/logs/speeds_by_segment_trip.log index dacbb21f9..9992947cc 100644 --- a/rt_segment_speeds/logs/speeds_by_segment_trip.log +++ b/rt_segment_speeds/logs/speeds_by_segment_trip.log @@ -46,3 +46,18 @@ 2023-07-25 11:49:17.344 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.006385 2023-07-25 12:16:28.698 | INFO | __main__::127 - speeds for stop segments: 1:04:00.240843 2023-07-25 12:16:28.699 | INFO | __main__::128 - execution time: 1:04:00.241659 +2023-07-25 17:55:40.998 | INFO | __main__::116 - Analysis date: 2023-05-17 +2023-07-25 18:25:17.498 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:29:36.467955 +2023-07-25 18:25:17.508 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.009634 +2023-07-25 18:51:59.936 | INFO | __main__::127 - speeds for stop segments: 0:56:18.910904 +2023-07-25 18:51:59.937 | INFO | __main__::128 - execution time: 0:56:18.911558 +2023-07-25 20:31:00.338 | INFO | __main__::116 - Analysis date: 2023-06-14 +2023-07-25 20:59:50.907 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:28:50.563683 +2023-07-25 20:59:50.913 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.006050 +2023-07-25 21:26:12.883 | INFO | __main__::127 - speeds for stop segments: 0:55:12.544803 +2023-07-25 21:26:12.884 | INFO | __main__::128 - execution time: 0:55:12.545554 +2023-07-25 22:48:46.313 | INFO | __main__::116 - Analysis date: 2023-07-12 +2023-07-25 23:31:55.464 | INFO | __main__:linear_referencing_and_speed_by_segment:84 - linear referencing: 0:43:09.123043 +2023-07-25 23:31:55.493 | INFO | __main__:linear_referencing_and_speed_by_segment:96 - calculate speeds: 0:00:00.028630 +2023-07-26 00:00:04.530 | INFO | __main__::127 - speeds for stop segments: 1:11:18.196214 +2023-07-26 00:00:04.531 | INFO | __main__::128 - execution time: 1:11:18.197170 diff --git a/rt_segment_speeds/logs/valid_vehicle_positions.log b/rt_segment_speeds/logs/valid_vehicle_positions.log index 93b10b576..1b0e441de 100644 --- a/rt_segment_speeds/logs/valid_vehicle_positions.log +++ b/rt_segment_speeds/logs/valid_vehicle_positions.log @@ -159,3 +159,30 @@ 2023-07-25 10:57:31.262 | INFO | __main__::344 - Analysis date: 2023-07-12 2023-07-25 11:10:59.353 | INFO | __main__::358 - pare down vp by stop segments special cases 0:13:28.085738 2023-07-25 11:10:59.355 | INFO | __main__::361 - execution time: 0:13:28.091919 +2023-07-25 17:35:24.578 | INFO | __main__::157 - Analysis date: 2023-05-17 +2023-07-25 17:38:17.011 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:02:52.414746 +2023-07-25 17:38:17.141 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.130051 +2023-07-25 17:42:44.665 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:04:27.523972 +2023-07-25 17:42:44.667 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:07:20.071086 +2023-07-25 17:42:44.667 | INFO | __main__::174 - execution time: 0:07:20.075709 +2023-07-25 17:43:02.687 | INFO | __main__::344 - Analysis date: 2023-05-17 +2023-07-25 17:54:28.505 | INFO | __main__::358 - pare down vp by stop segments special cases 0:11:25.812402 +2023-07-25 17:54:28.506 | INFO | __main__::361 - execution time: 0:11:25.817451 +2023-07-25 20:11:18.015 | INFO | __main__::157 - Analysis date: 2023-06-14 +2023-07-25 20:13:59.584 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:02:41.562999 +2023-07-25 20:13:59.739 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.154597 +2023-07-25 20:18:02.221 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:04:02.482467 +2023-07-25 20:18:02.223 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:06:44.202068 +2023-07-25 20:18:02.224 | INFO | __main__::174 - execution time: 0:06:44.207623 +2023-07-25 20:18:18.862 | INFO | __main__::344 - Analysis date: 2023-06-14 +2023-07-25 20:29:41.529 | INFO | __main__::358 - pare down vp by stop segments special cases 0:11:22.662479 +2023-07-25 20:29:41.530 | INFO | __main__::361 - execution time: 0:11:22.667589 +2023-07-25 22:13:36.796 | INFO | __main__::157 - Analysis date: 2023-07-12 +2023-07-25 22:16:54.225 | INFO | __main__:pare_down_vp_by_segment:130 - merge usable vp with sjoin results: 0:03:17.415267 +2023-07-25 22:16:54.345 | INFO | __main__:pare_down_vp_by_segment:139 - keep enter/exit points: 0:00:00.119414 +2023-07-25 22:22:49.255 | INFO | __main__:pare_down_vp_by_segment:145 - exported: 0:05:54.910487 +2023-07-25 22:22:49.260 | INFO | __main__::171 - pare down vp by stop segments normal cases 0:09:12.450290 +2023-07-25 22:22:49.263 | INFO | __main__::174 - execution time: 0:09:12.463640 +2023-07-25 22:23:28.707 | INFO | __main__::344 - Analysis date: 2023-07-12 +2023-07-25 22:46:48.934 | INFO | __main__::358 - pare down vp by stop segments special cases 0:23:20.216783 +2023-07-25 22:46:48.962 | INFO | __main__::361 - execution time: 0:23:20.251985 diff --git a/rt_segment_speeds/scripts/calculate-trip-avg.ipynb b/rt_segment_speeds/scripts/calculate-trip-avg.ipynb deleted file mode 100644 index fecd4ac20..000000000 --- a/rt_segment_speeds/scripts/calculate-trip-avg.ipynb +++ /dev/null @@ -1,917 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "id": "8f0f779b-50a4-4fa6-9fe9-9632c161316a", - "metadata": {}, - "source": [ - "# Average speeds across entire trip" - ] - }, - { - "cell_type": "code", - "execution_count": 34, - "id": "52c8d239-4897-42e0-80e0-2352d4b0a79b", - "metadata": {}, - "outputs": [], - "source": [ - "#import os\n", - "#os.environ['USE_PYGEOS'] = '0'\n", - "# turning this off makes to_crs really slow\n", - "\n", - "import dask.dataframe as dd\n", - "import dask_geopandas as dg\n", - "import folium\n", - "import geopandas as gpd\n", - "import numpy as np\n", - "import pandas as pd\n", - "import shapely\n", - "\n", - "from segment_speed_utils import helpers, sched_rt_utils, wrangle_shapes\n", - "from segment_speed_utils.project_vars import (SEGMENT_GCS,\n", - " CONFIG_PATH, PROJECT_CRS\n", - " )\n", - "analysis_date = \"2023-05-17\"" - ] - }, - { - "cell_type": "code", - "execution_count": 35, - "id": "5e5d3ceb-f859-4583-b361-e1b51c668a68", - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_parquet(f\"{SEGMENT_GCS}trip_summary/trip_speed_{analysis_date}.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "id": "07fb6ffa-6fdd-4a14-be21-c31d6afe653f", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(31, 14)" - ] - }, - "execution_count": 43, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df.speed_mph >= 60].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 42, - "id": "85dca4cc-9b3a-4b3a-806e-51a8d1539833", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(4686, 14)" - ] - }, - "execution_count": 42, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df[df.speed_mph <= 3].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 41, - "id": "fe0a82c9-214f-4c8f-9928-7da9b9a8adf3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(68556, 14)" - ] - }, - "execution_count": 41, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "id": "7e3f17f1-1883-41c0-aee2-6f560812442a", - "metadata": {}, - "outputs": [], - "source": [ - "# in case there are fewer shapes to grab\n", - "shapes_list = df.shape_array_key.unique().tolist()\n", - "\n", - "shapes = helpers.import_scheduled_shapes(\n", - " analysis_date,\n", - " columns = [\"shape_array_key\",\"geometry\"],\n", - " filters = [[(\"shape_array_key\", \"in\", shapes_list)]],\n", - " get_pandas = True,\n", - " crs = PROJECT_CRS\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "id": "0122c274-e71c-407e-84c3-743899e9b525", - "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/opt/conda/lib/python3.9/site-packages/pygeos/linear.py:87: RuntimeWarning: invalid value encountered in line_locate_point\n" - ] - } - ], - "source": [ - "linear_ref = wrangle_shapes.linear_reference_vp_against_segment(\n", - " df,\n", - " shapes,\n", - " segment_identifier_cols = [\"shape_array_key\"]\n", - ").compute()\n", - "\n", - "linear_ref.to_parquet(\"test.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0c66c64a-a4b9-454a-9e7e-f6253085c160", - "metadata": {}, - "outputs": [], - "source": [ - "linear_ref = pd.read_parquet(\"test.parquet\")" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "id": "f4b557a8-019a-4ea5-b544-2a810515c5ab", - "metadata": {}, - "outputs": [], - "source": [ - "def distance_and_seconds_elapsed(\n", - " df: pd.DataFrame, \n", - " group_cols: list\n", - ") -> pd.DataFrame:\n", - " \"\"\"\n", - " If every trip has 3 vp, we want the change in time and distance\n", - " between 1st and 2nd, 2nd and 3rd.\n", - " Then, sum up the change in time and change by trip.\n", - " \"\"\"\n", - " dist_col = \"shape_meters\"\n", - " time_col = \"location_timestamp_local\"\n", - " sort_cols = group_cols + [\"vp_idx\"]\n", - " \n", - "\n", - " df = df.assign(\n", - " prior_dist = (df.sort_values(sort_cols)\n", - " .groupby(group_cols, \n", - " observed=True, group_keys=False)\n", - " [dist_col]\n", - " .apply(lambda x: x.shift(1))\n", - " ),\n", - " prior_time = (df.sort_values(sort_cols)\n", - " .groupby(group_cols, \n", - " observed=True, group_keys=False)\n", - " [time_col]\n", - " .apply(lambda x: x.shift(1))\n", - " ) \n", - " )\n", - " \n", - " df = df.assign(\n", - " change_meters = df[dist_col] - df.prior_dist,\n", - " change_sec = (df[time_col] - df.prior_time).divide(\n", - " np.timedelta64(1, 's'))\n", - " )\n", - " \n", - " df2 = (df.groupby(group_cols, \n", - " observed=True, group_keys=False)\n", - " .agg({\"change_meters\": \"sum\", \n", - " \"change_sec\": \"sum\"})\n", - " .reset_index()\n", - " )\n", - " \n", - " df2 = df2.assign(\n", - " speed_mph = (df2.change_meters.divide(df2.change_sec) * \n", - " rt_utils.MPH_PER_MPS)\n", - " )\n", - " \n", - " return df2" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "id": "192d6fd1-c8c3-461f-a3fe-1c350fb6096c", - "metadata": {}, - "outputs": [], - "source": [ - "from shared_utils import rt_utils\n", - "\n", - "speed = distance_and_seconds_elapsed(\n", - " linear_ref, \n", - " group_cols = [\"gtfs_dataset_key\", \"trip_id\"]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "id": "9cdf3484-20d8-499a-a3e4-41f7afae23a3", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(68556, 5)" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "speed.shape" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "id": "f28f722f-b481-4543-ae5c-f678e69b30a2", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(20, 5)" - ] - }, - "execution_count": 10, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "speed[speed.speed_mph>=70].shape" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "id": "1346a085-dc35-4792-b487-71dd0b559d80", - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(4378, 5)" - ] - }, - "execution_count": 13, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "speed[speed.speed_mph<=2].shape" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4420a0f0-c63d-4d24-9e80-919a7fe32744", - "metadata": {}, - "outputs": [], - "source": [ - "def aggregate_by_operator_route_time_of_day():" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6ca60770-45c1-4aab-b0e9-8b1830d9250e", - "metadata": {}, - "outputs": [], - "source": [ - "#test_key = \"00accf770009aafd5dc103ff2eeddb37\"\n", - "#test_trip = \"t_1995375_b_33395_tn_0\"\n", - "test_shape = \"70f010e0dba18191937ed4b5bea42e8a\"" - ] - }, - { - "cell_type": "markdown", - "id": "dd00a9e9-f60a-4cff-9870-b8d93b763a7d", - "metadata": {}, - "source": [ - "This trip has a lot of vp that end up not being joined to any segment.\n", - "Including those vp far away from the shape mean that the interpolation results show the same thing, because essentially, all those points fall closest to the one end of the shape, and when taking the difference in `shape_meters`, the difference is zero.\n", - "\n", - "This is a compelling reason to add the % of segments touched in the sjoin results. Before, we used time cutoff, because it's easier to implement. '\n", - "\n", - "At least for calculating trip average speeds, we do need to touch at least 50% of the segments, or even 70% of segments as recommended in notebook, to only calculate entire trip averages on trips that have enough vp.\n", - "\n", - "The con of using % of segments is that it becomes even more crucial that segments are cut correctly. If we miss a segment (which we might, currently), there are vp that are not being joined, and we may throw out too many trips because it fails the % segments threshold.\n", - "\n", - "For now, let's take the sjoin results and use a couple points to triangulate the distance. Make an array, and pick points either every 10 min or at least 3 points to calculate distance." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "25017d03-f86a-4cc3-a5f6-584e6b647952", - "metadata": {}, - "outputs": [], - "source": [ - "ddf = A2.merge_usable_vp_with_sjoin_vpidx(\n", - " [test_shape],\n", - " USABLE_FILE,\n", - " SJOIN_FILE,\n", - " SEGMENT_IDENTIFIER_COLS,\n", - " GROUPING_COL\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9ecb54b-6a8c-4b43-abcd-40d652ac92cd", - "metadata": {}, - "outputs": [], - "source": [ - "ddf = ddf.compute()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f5bc03d7-5ebf-4100-bdc0-250876f1c04e", - "metadata": {}, - "outputs": [], - "source": [ - "from shared_utils import geography_utils\n", - "\n", - "ddf = geography_utils.create_point_geometry(ddf, \"x\", \"y\")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f940224a-57e1-4d67-a0d7-8706733aa9fd", - "metadata": {}, - "outputs": [], - "source": [ - "crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(\n", - " analysis_date, \n", - " [\"feed_key\", \"trip_id\", GROUPING_COL, \"shape_id\"] \n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "50cc7daf-b044-4add-9f4e-85424e7b514c", - "metadata": {}, - "outputs": [], - "source": [ - "shapes = helpers.import_scheduled_shapes(\n", - " analysis_date,\n", - " columns = [\"shape_array_key\", \"geometry\"],\n", - " filters = [[(\"shape_array_key\", \"in\", [test_shape])]],\n", - " get_pandas = True,\n", - " crs = PROJECT_CRS\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c9ac9777-0fd1-4cb7-90bf-1718a7e75e93", - "metadata": {}, - "outputs": [], - "source": [ - "shapes2 = pd.merge(\n", - " shapes,\n", - " crosswalk,\n", - " on = \"shape_array_key\",\n", - " how = \"inner\"\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "785628db-70b1-4152-a507-d35cb610f29c", - "metadata": {}, - "outputs": [], - "source": [ - "ddf2 = ddf.to_crs(PROJECT_CRS).drop(\n", - " columns = [\"location_timestamp\", \"location_timestamp_local\", \n", - " \"activity_date\"])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "956f499e-475f-4427-9338-c9c6062d97af", - "metadata": {}, - "outputs": [], - "source": [ - "m = ddf2.explore(\"trip_id\", tiles = \"CartoDB Positron\")\n", - "m = shapes2.explore(m=m, color=\"yellow\", name=\"shape\")\n", - "folium.LayerControl().add_to(m)\n", - "m" - ] - }, - { - "cell_type": "markdown", - "id": "1450a48c-47f8-429a-b92b-b4b73f6893a9", - "metadata": {}, - "source": [ - "## Triangulate vp based on sjoin results" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a377e7e-e6c6-489f-b035-f4937622b3a8", - "metadata": {}, - "outputs": [], - "source": [ - "def list_of_vp_by_trip(\n", - " df: pd.DataFrame, \n", - " group_cols: list = [\"gtfs_dataset_key\", \"trip_id\"]\n", - ") -> pd.DataFrame:\n", - "\n", - " df2 = (df.groupby(trip_cols, observed=True)\n", - " .agg({\"vp_idx\": list})\n", - " .reset_index()\n", - " )\n", - " \n", - " return df2" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2f294723-e7c9-46aa-bea1-7ea8397b781b", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "20d94c32-2b9d-448b-bfa4-d4834f923dd4", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4f475c8d-caae-4a13-a3ef-a1d7a8943752", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e2e2ab41-148b-44e4-976f-1a5c836405ec", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2cd529e8-4dca-446b-a540-236e99d6f78d", - "metadata": {}, - "outputs": [], - "source": [ - "by_trip_ddfs = [list_of_vp_by_trip(df, trip_cols) for df in subset_vp_ddfs]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6c59093a-5f29-408b-a460-fcea81d44c0c", - "metadata": {}, - "outputs": [], - "source": [ - "one = by_trip_ddfs[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c70056d1-7a0b-4b59-a5c3-d84bc328b4e6", - "metadata": {}, - "outputs": [], - "source": [ - "trip_df = compute(one)[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "c4df9dfb-5e12-4e16-8078-ed8e8be1c95d", - "metadata": {}, - "outputs": [], - "source": [ - "def count_vp_and_get_every_10_min(my_list: list):\n", - " vp_idx_arr = np.asarray(my_list)\n", - " subset_arr = vp_idx_arr[::30]\n", - " \n", - " if len(subset_arr) < 3:\n", - " subset_arr = vp_idx_arr[:15]\n", - " \n", - " return list(subset_arr)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d674a955-b843-4045-9f85-47293aadecaa", - "metadata": {}, - "outputs": [], - "source": [ - "trip_df = trip_df.assign(\n", - " vp_idx2 = trip_df.apply(\n", - " lambda x: \n", - " count_vp_and_get_every_10_min(x.vp_idx), \n", - " axis=1, meta=('vp_idx2', 'object'))\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "6beb7014-d275-4a7e-92ff-ad94787602c3", - "metadata": {}, - "outputs": [], - "source": [ - "keep_subset_vp = trip_df.vp_idx2.explode()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "8bf189ec-f3c5-4e99-8879-531fecca531d", - "metadata": {}, - "outputs": [], - "source": [ - "ddf_subset = ddf[ddf.vp_idx.isin(keep_subset_vp)][\n", - " [\"gtfs_dataset_key\", \"trip_id\",\n", - " \"location_timestamp_local\",\n", - " \"x\", \"y\", \"vp_idx\"]]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "747033ce-549a-4dc7-9877-b3c758a6d692", - "metadata": {}, - "outputs": [], - "source": [ - "crosswalk = sched_rt_utils.crosswalk_scheduled_trip_grouping_with_rt_key(\n", - " analysis_date, \n", - " [\"feed_key\", \"trip_id\", GROUPING_COL]\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e4ca3a07-1246-41a9-baa9-666e01f6c8dd", - "metadata": {}, - "outputs": [], - "source": [ - "subset_vp_shape = delayed(dd.merge)(\n", - " ddf_subset,\n", - " crosswalk,\n", - " on = [\"gtfs_dataset_key\", \"trip_id\"],\n", - " how = \"inner\"\n", - ").drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "95b33d2c-7991-44af-b087-7720ad90762a", - "metadata": {}, - "outputs": [], - "source": [ - "subset_shapes = subset_vp_shape.shape_array_key.unique().persist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "3034219b-99d2-4f64-bcb4-f6082e32760e", - "metadata": {}, - "outputs": [], - "source": [ - "subset_shapes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2a5214d5-6523-4eeb-800f-dc4088aa7a13", - "metadata": {}, - "outputs": [], - "source": [ - "shapes = helpers.import_scheduled_shapes(\n", - " analysis_date,\n", - " columns = [\"shape_array_key\", \"geometry\"],\n", - " filters = [[(\"shape_array_key\", \"in\", subset_shapes)]],\n", - " get_pandas = True,\n", - " crs = PROJECT_CRS\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "f0c78935-f7ab-4d78-9261-fdacf96e8abe", - "metadata": {}, - "outputs": [], - "source": [ - "RT_OPERATORS = subset_vp_shape.gtfs_dataset_key.unique().compute()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9155289c-35bc-4bdc-bfe6-a67b670309dc", - "metadata": {}, - "outputs": [], - "source": [ - "test_operator = RT_OPERATORS[0]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "18b2632f-c9da-4161-9efc-dc7d773cb3c1", - "metadata": {}, - "outputs": [], - "source": [ - "subset_vp_operator = subset_vp_shape[\n", - " subset_vp_shape.gtfs_dataset_key==test_operator]" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "d2c209eb-b7d5-40ac-b208-6be674e59308", - "metadata": {}, - "outputs": [], - "source": [ - "linear_ref_operator = delayed(\n", - " wrangle_shapes.linear_reference_vp_against_segment)(\n", - " subset_vp_operator,\n", - " shapes,\n", - " segment_identifier_cols = [GROUPING_COL]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "28593d93-f10e-4b55-8113-2a4b96a0664a", - "metadata": {}, - "outputs": [], - "source": [ - "linear_ref = delayed(wrangle_shapes.linear_reference_vp_against_segment)(\n", - " subset_vp_shape,\n", - " shapes,\n", - " segment_identifier_cols = [GROUPING_COL]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "71530c79-c2cf-4d54-8b14-fdd9d0810c26", - "metadata": {}, - "outputs": [], - "source": [ - "linear_ref" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "2b1a5e65-92ea-4687-af86-306efc54cd27", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "9135590c-d7e2-4fc7-876b-88b9e99cf5a5", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "e3defe7e-2f37-4b9e-b149-200433924255", - "metadata": {}, - "outputs": [], - "source": [ - "operators = dd.read_parquet(\n", - " f\"{SEGMENT_GCS}{INPUT_FILE}\", \n", - " columns = [\"gtfs_dataset_key\"]\n", - ").gtfs_dataset_key.unique().compute().tolist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "010db624-33bc-4fe7-8f9f-f957ff183f8a", - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "87843b73-9963-4ad6-8239-af991b2fdb47", - "metadata": {}, - "outputs": [], - "source": [ - "subset_operators = operators[:2]\n", - "subset_operators" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "02d6ccf0-5f72-4502-a0a0-43da58b088e1", - "metadata": {}, - "outputs": [], - "source": [ - "ddf = dd.read_parquet(\n", - " f\"{SEGMENT_GCS}{INPUT_FILE}\", \n", - " filters = [[(\"gtfs_dataset_key\", \"in\", subset_operators)]],\n", - " columns = [\"vp_idx\"]\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "bc95f22a-549e-4176-9ff5-4d2f0dbaac0d", - "metadata": {}, - "outputs": [], - "source": [ - "trip_cols = [\"gtfs_dataset_key\", \"trip_id\"]\n", - "hour_min_cols = [\"hour\", \"minute\"]" - ] - }, - { - "cell_type": "markdown", - "id": "5cb85263-ca3c-49ac-acc8-2b03bb173a9b", - "metadata": {}, - "source": [ - "## Pings per minute for service hours" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "22f194bd-5713-478e-b7ab-5634a6c86a53", - "metadata": {}, - "outputs": [], - "source": [ - "ddf = ddf.repartition(npartitions=5)\n", - "\n", - "ddf = ddf.assign(\n", - " minute = ddf.location_timestamp_local.dt.minute\n", - ")" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b26e222c-8c20-43f3-b706-4bf311a7fda8", - "metadata": {}, - "outputs": [], - "source": [ - "ddf.dtypes" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "de3ab88f-97b3-45d3-95ef-28a907a25d1f", - "metadata": {}, - "outputs": [], - "source": [ - "num_vp_pings = (ddf.groupby(trip_cols + hour_min_cols, observed=True)\n", - " [\"location_timestamp_local\"]\n", - " .count()\n", - " .dropna()\n", - " .reset_index()\n", - " .rename(columns = {\"location_timestamp_local\": \"num_pings\"})\n", - " )" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "4938392d-a2ee-473e-91b5-9805b0aceb14", - "metadata": {}, - "outputs": [], - "source": [ - "num_vp_pings = num_vp_pings.assign(\n", - " atleast2 = num_vp_pings.apply(\n", - " lambda x: 1 if x.num_pings >= 2\n", - " else 0, axis=1, meta=('atleast2', 'int8'))\n", - " ) " - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "0fcb4ff1-04f6-4870-9e67-4391be2508a3", - "metadata": {}, - "outputs": [], - "source": [ - "vp_pings = (num_vp_pings.groupby(trip_cols)\n", - " .agg({\n", - " \"hour\": \"size\",\n", - " \"atleast2\": \"sum\"})\n", - " .dropna()\n", - " .reset_index()\n", - " ).rename(columns = {\n", - " \"hour\": \"trip_min_elapsed\"})" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "813ec2d3-8cf8-4e44-96bd-405ad65c19a9", - "metadata": {}, - "outputs": [], - "source": [ - "vp_pings = vp_pings.persist()" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "id": "b427d9db-0c0e-43b9-9b82-464dd923d3e0", - "metadata": {}, - "outputs": [], - "source": [ - "vp_pings.compute()" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3 (ipykernel)", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.13" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}