diff --git a/_shared_utils/shared_utils/rt_dates.py b/_shared_utils/shared_utils/rt_dates.py index bcafbfb74..73256442e 100644 --- a/_shared_utils/shared_utils/rt_dates.py +++ b/_shared_utils/shared_utils/rt_dates.py @@ -64,6 +64,13 @@ "jul2024": "2024-07-17", "aug2024": "2024-08-14", "sep2024": "2024-09-18", + "oct2024a": "2024-10-14", + "oct2024b": "2024-10-15", + "oct2024": "2024-10-16", + "oct2024c": "2024-10-17", + "oct2024d": "2024-10-18", + "oct2024e": "2024-10-19", + "oct2024f": "2024-10-20", } y2023_dates = [ @@ -73,7 +80,7 @@ y2024_dates = [v for k, v in DATES.items() if k.endswith("2024")] -valid_weeks = ["apr2023", "oct2023", "apr2024"] +valid_weeks = ["apr2023", "oct2023", "apr2024", "oct2024"] def get_week(month: Literal[[*valid_weeks]], exclude_wed: bool) -> list: @@ -86,6 +93,7 @@ def get_week(month: Literal[[*valid_weeks]], exclude_wed: bool) -> list: apr2023_week = get_week(month="apr2023", exclude_wed=False) oct2023_week = get_week(month="oct2023", exclude_wed=False) apr2024_week = get_week(month="apr2024", exclude_wed=False) +oct2024_week = get_week(month="oct2024", exclude_wed=False) MONTH_DICT = { 1: "January", diff --git a/gtfs_funnel/logs/download_data.log b/gtfs_funnel/logs/download_data.log index 938fbc549..182945c8c 100644 --- a/gtfs_funnel/logs/download_data.log +++ b/gtfs_funnel/logs/download_data.log @@ -533,3 +533,35 @@ 2024-09-19 08:17:15.855 | INFO | __main__:download_one_day:33 - *********** Download st data *********** 2024-09-19 08:19:06.258 | INFO | __main__:download_one_day:56 - execution time: 0:01:52.036660 2024-09-19 09:28:35.882 | INFO | __main__:download_one_year:35 - execution time: 0:00:45.388883 +2024-10-17 19:48:08.455 | INFO | __main__:download_one_day:45 - Analysis date: 2024-10-14 +2024-10-17 19:48:10.847 | INFO | __main__:download_one_day:52 - # operators to run: 220 +2024-10-17 19:48:10.847 | INFO | __main__:download_one_day:56 - *********** Download trips data *********** +2024-10-17 19:48:42.107 | INFO | 
__main__:download_one_day:86 - execution time: 0:00:33.631682 +2024-10-17 19:48:42.297 | INFO | __main__:download_one_day:45 - Analysis date: 2024-10-15 +2024-10-17 19:48:44.148 | INFO | __main__:download_one_day:52 - # operators to run: 220 +2024-10-17 19:48:44.150 | INFO | __main__:download_one_day:56 - *********** Download trips data *********** +2024-10-17 19:49:14.779 | INFO | __main__:download_one_day:86 - execution time: 0:00:32.481154 +2024-10-17 19:49:33.224 | INFO | __main__:download_one_day:22 - Analysis date: 2024-10-14 +2024-10-17 19:49:34.939 | INFO | __main__:download_one_day:29 - # operators to run: 220 +2024-10-17 19:49:34.940 | INFO | __main__:download_one_day:33 - *********** Download stops data *********** +2024-10-17 19:49:45.003 | INFO | __main__:download_one_day:64 - execution time: 0:00:11.778543 +2024-10-17 19:49:45.047 | INFO | __main__:download_one_day:22 - Analysis date: 2024-10-15 +2024-10-17 19:49:46.476 | INFO | __main__:download_one_day:29 - # operators to run: 220 +2024-10-17 19:49:46.477 | INFO | __main__:download_one_day:33 - *********** Download stops data *********** +2024-10-17 19:49:56.983 | INFO | __main__:download_one_day:64 - execution time: 0:00:11.935309 +2024-10-17 19:50:15.683 | INFO | __main__:download_one_day:22 - Analysis date: 2024-10-14 +2024-10-17 19:50:17.694 | INFO | __main__:download_one_day:29 - # operators to run: 220 +2024-10-17 19:50:17.695 | INFO | __main__:download_one_day:33 - *********** Download routelines data *********** +2024-10-17 19:52:27.566 | INFO | __main__:download_one_day:63 - execution time: 0:02:11.882708 +2024-10-17 19:52:27.631 | INFO | __main__:download_one_day:22 - Analysis date: 2024-10-15 +2024-10-17 19:52:29.725 | INFO | __main__:download_one_day:29 - # operators to run: 220 +2024-10-17 19:52:29.726 | INFO | __main__:download_one_day:33 - *********** Download routelines data *********** +2024-10-17 19:54:42.349 | INFO | __main__:download_one_day:63 - execution time: 0:02:14.717313 
+2024-10-17 19:54:59.527 | INFO | __main__:download_one_day:21 - Analysis date: 2024-10-14 +2024-10-17 19:55:01.219 | INFO | __main__:download_one_day:29 - # operators to run: 183 +2024-10-17 19:55:01.219 | INFO | __main__:download_one_day:33 - *********** Download st data *********** +2024-10-17 19:56:58.579 | INFO | __main__:download_one_day:56 - execution time: 0:01:59.050949 +2024-10-17 19:56:59.929 | INFO | __main__:download_one_day:21 - Analysis date: 2024-10-15 +2024-10-17 19:57:01.448 | INFO | __main__:download_one_day:29 - # operators to run: 189 +2024-10-17 19:57:01.449 | INFO | __main__:download_one_day:33 - *********** Download st data *********** +2024-10-17 19:59:04.659 | INFO | __main__:download_one_day:56 - execution time: 0:02:04.728848 diff --git a/gtfs_funnel/logs/download_vp_v2.log b/gtfs_funnel/logs/download_vp_v2.log index e81bf94b5..726fc329a 100644 --- a/gtfs_funnel/logs/download_vp_v2.log +++ b/gtfs_funnel/logs/download_vp_v2.log @@ -350,3 +350,25 @@ 2024-09-19 08:33:43.251 | INFO | __main__::112 - export concatenated vp: 0:04:05.069147 2024-09-19 08:37:30.865 | INFO | __main__::134 - remove batched parquets 2024-09-19 08:37:30.865 | INFO | __main__::137 - execution time: 0:08:10.892310 +2024-10-17 19:59:24.445 | INFO | __main__::148 - Analysis date: 2024-10-14 +2024-10-17 20:01:27.918 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:02:03.438786 +2024-10-17 20:02:40.507 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:01:12.588438 +2024-10-17 20:06:47.856 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:04:07.347856 +2024-10-17 20:08:36.666 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:01:48.808560 +2024-10-17 20:08:36.666 | INFO | __main__::155 - execution time: 0:09:12.186603 +2024-10-17 20:08:36.667 | INFO | __main__::148 - Analysis date: 2024-10-15 +2024-10-17 20:10:56.539 | INFO 
| __main__:loop_through_batches_and_download_vp:111 - exported batch 0 to GCS: 0:02:19.871706 +2024-10-17 20:12:13.012 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 1 to GCS: 0:01:16.472618 +2024-10-17 20:16:18.595 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 2 to GCS: 0:04:05.582366 +2024-10-17 20:18:25.253 | INFO | __main__:loop_through_batches_and_download_vp:111 - exported batch 3 to GCS: 0:02:06.656799 +2024-10-17 20:18:25.253 | INFO | __main__::155 - execution time: 0:09:48.586216 +2024-10-17 20:18:43.093 | INFO | __main__::97 - Analysis date: 2024-10-14 +2024-10-17 20:18:50.762 | INFO | __main__::105 - concat and filter batched data: 0:00:07.668354 +2024-10-17 20:22:34.818 | INFO | __main__::112 - export concatenated vp: 0:03:44.055977 +2024-10-17 20:26:21.567 | INFO | __main__::134 - remove batched parquets +2024-10-17 20:26:21.568 | INFO | __main__::137 - execution time: 0:07:38.474288 +2024-10-17 20:26:23.338 | INFO | __main__::97 - Analysis date: 2024-10-15 +2024-10-17 20:26:29.465 | INFO | __main__::105 - concat and filter batched data: 0:00:05.953508 +2024-10-17 20:30:09.125 | INFO | __main__::112 - export concatenated vp: 0:03:39.660077 +2024-10-17 20:33:58.275 | INFO | __main__::134 - remove batched parquets +2024-10-17 20:33:58.275 | INFO | __main__::137 - execution time: 0:07:34.764458 diff --git a/gtfs_funnel/logs/vp_preprocessing.log b/gtfs_funnel/logs/vp_preprocessing.log index 7b9dddf71..abb99c591 100644 --- a/gtfs_funnel/logs/vp_preprocessing.log +++ b/gtfs_funnel/logs/vp_preprocessing.log @@ -211,3 +211,25 @@ 2024-09-19 09:03:13.200 | INFO | __main__::235 - vp with dwell time 2024-09-18: 0:05:59.311280 2024-09-19 09:08:43.742 | INFO | __main__::120 - 2024-09-18: condense vp for trip 0:05:09.575132 2024-09-19 09:20:16.936 | INFO | __main__::128 - 2024-09-18: prepare vp to use in nearest neighbor: 0:11:33.194871 +2024-10-17 20:49:37.343 | INFO | __main__::169 - 2024-10-14: pare down 
vp: 0:02:16.208887 +2024-10-17 20:52:10.568 | INFO | __main__::169 - 2024-10-15: pare down vp: 0:02:33.177584 +2024-10-17 20:56:36.196 | INFO | __main__:attach_prior_vp_add_direction:90 - persist vp gddf: 0:04:08.937551 +2024-10-17 21:00:39.569 | INFO | __main__:attach_prior_vp_add_direction:122 - np vectorize arrays for direction: 0:04:03.372933 +2024-10-17 21:00:46.826 | INFO | __main__::194 - 2024-10-14: export vp direction: 0:08:19.567745 +2024-10-17 21:02:15.636 | INFO | __main__::200 - 2024-10-14: export usable vp with direction: 0:01:28.809937 +2024-10-17 21:02:15.637 | INFO | __main__::203 - 2024-10-14: vp_direction script execution time: 0:09:48.377682 +2024-10-17 21:06:33.984 | INFO | __main__:attach_prior_vp_add_direction:90 - persist vp gddf: 0:04:18.346553 +2024-10-17 21:10:43.386 | INFO | __main__:attach_prior_vp_add_direction:122 - np vectorize arrays for direction: 0:04:09.402165 +2024-10-17 21:10:50.943 | INFO | __main__::194 - 2024-10-15: export vp direction: 0:08:35.305648 +2024-10-17 21:12:18.037 | INFO | __main__::200 - 2024-10-15: export usable vp with direction: 0:01:27.093838 +2024-10-17 21:12:18.040 | INFO | __main__::203 - 2024-10-15: vp_direction script execution time: 0:10:02.399486 +2024-10-17 21:17:25.614 | INFO | __main__::213 - compute dwell df: 0:04:31.222087 +2024-10-17 21:18:45.449 | INFO | __main__::235 - merge with original and export: 0:01:19.834575 +2024-10-17 21:18:45.451 | INFO | __main__::236 - vp with dwell time 2024-10-14: 0:05:51.056662 +2024-10-17 21:24:09.014 | INFO | __main__::213 - compute dwell df: 0:05:23.562191 +2024-10-17 21:25:22.669 | INFO | __main__::235 - merge with original and export: 0:01:13.654913 +2024-10-17 21:25:22.671 | INFO | __main__::236 - vp with dwell time 2024-10-15: 0:06:37.217104 +2024-10-17 21:31:14.849 | INFO | __main__::120 - 2024-10-14: condense vp for trip 0:05:34.574524 +2024-10-17 21:42:43.893 | INFO | __main__::128 - 2024-10-14: prepare vp to use in nearest neighbor: 0:11:29.044763 
+2024-10-17 21:47:57.273 | INFO | __main__::120 - 2024-10-15: condense vp for trip 0:05:13.379949 +2024-10-17 21:59:50.020 | INFO | __main__::128 - 2024-10-15: prepare vp to use in nearest neighbor: 0:11:52.747009 diff --git a/gtfs_funnel/update_vars.py b/gtfs_funnel/update_vars.py index 02828ae30..a95dec86a 100644 --- a/gtfs_funnel/update_vars.py +++ b/gtfs_funnel/update_vars.py @@ -1,5 +1,6 @@ from shared_utils import catalog_utils, rt_dates +oct2024_week = rt_dates.get_week("oct2024", exclude_wed=True) apr2024_week = rt_dates.get_week("apr2024", exclude_wed=True) oct2023_week = rt_dates.get_week("oct2023", exclude_wed=True) apr2023_week = rt_dates.get_week("apr2023", exclude_wed=True) @@ -11,7 +12,7 @@ ) -analysis_date_list = [rt_dates.DATES["sep2024"]] +analysis_date_list = [rt_dates.DATES["oct2024"]] GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") diff --git a/open_data/Makefile b/open_data/Makefile index c8376836b..3d68717e5 100644 --- a/open_data/Makefile +++ b/open_data/Makefile @@ -8,8 +8,11 @@ compile_open_data_portal: python gcs_to_esri.py #update metadata.yml (add new datasets here) python supplement_meta.py # run if any changes are made to yml - #python arcgis_script_pro.py #(in ESRI!) python update_data_dict.py # check if columns are missing in data_dictionary yml python update_fields_fgdc.py # populate fields with data dictionary yml values, run if update_data_dict had changes to incorporate + # Download the zipped shapefiles and metadata.yml and move to local ESRI directory + #python arcgis_pro_script.py #(in ESRI!) python metadata_update_pro.py # go back into ESRI and update xml + # Download the overwritten XML files in xml/run_in_esri/ and move to local ESRI directory. + #python arcgis_pro_script.py #(in ESRI!) 
python cleanup.py # run after ESRI work done \ No newline at end of file diff --git a/open_data/arcgis_pro_notebook_sample.ipynb b/open_data/arcgis_pro_notebook_sample.ipynb new file mode 100644 index 000000000..cbbd6cff6 --- /dev/null +++ b/open_data/arcgis_pro_notebook_sample.ipynb @@ -0,0 +1,477 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "import os\n", + "import arcpy\n", + "import json\n", + "\n", + "from arcpy import metadata as md\n", + "S_NUMBER = \"\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "arcpy.env.workspace = os.path.join(\n", + " \"C:\\\\\", \"Users\", S_NUMBER, \n", + " \"Documents\", \"ArcGIS\"\n", + ")\n", + "working_dir = arcpy.env.workspace\n", + "working_dir" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "directory = arcpy.GetInstallInfo(\"desktop\")[\"InstallDir\"] \n", + "directory" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Set datasets to update...match to `update_vars.RUN_ME`\n", + "in_features = [\n", + " 'ca_hq_transit_areas',\n", + " 'ca_hq_transit_stops',\n", + " 'ca_transit_routes',\n", + " 'ca_transit_stops',\n", + " 'speeds_by_stop_segments',\n", + " 'speeds_by_route_time_of_day'\n", + "]\n", + "\n", + "staging_location = 'staging.gdb'\n", + "out_location = 'open_data.gdb'\n", + "\n", + "def feature_class_in_gdb_path(my_gdb, file_name):\n", + " return os.path.join(my_gdb, file_name)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Unzip zipped shapefiles, download metadata.json into local path\n", + "\n", + "### Set FGDC field defs for each dataset and export XML (do once when new dataset added)\n", + "\n", + "Only the FGDC 
standard keeps fields.\n", + "See if we can use this and combine it with our ISO 19139 standard later." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Read in json with all the changes we need for each layer\n", + "with open(f\"{working_dir}\\metadata.json\") as f:\n", + " meta_dict = json.load(f)\n", + "\n", + " \n", + "def update_metadata_class(this_feature_class, meta_dict_for_dataset: dict):\n", + " \"\"\"\n", + " Update the elements in the arcpy.metadata class.\n", + " \"\"\"\n", + " # Now update metadata class elements that are available\n", + " source_metadata = md.Metadata(this_feature_class)\n", + "\n", + " source_metadata.title = meta_dict_for_dataset[\"dataset_name\"]\n", + " source_metadata.tags = meta_dict_for_dataset[\"theme_keywords\"]\n", + " source_metadata.summary = meta_dict_for_dataset[\"summary_purpose\"]\n", + " source_metadata.description = meta_dict_for_dataset[\"description\"]\n", + " source_metadata.accessConstraints = meta_dict_for_dataset[\"public_access\"]\n", + " source_metadata.save()\n", + " \n", + " return" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def export_fgdc_metadata(one_feature_class):\n", + " \"\"\"\n", + " Export XML as FGDC format, \n", + " that's the only one that keeps field names and definitions\n", + " available.\n", + " \"\"\"\n", + " this_feature_class = feature_class_in_gdb_path(\n", + " staging_location, \n", + " one_feature_class\n", + " )\n", + " \n", + " subset_meta_dict = meta_dict[one_feature_class]\n", + "\n", + " update_metadata_class(this_feature_class, subset_meta_dict)\n", + " \n", + " source_metadata = md.Metadata(this_feature_class)\n", + " \n", + " # Export metadata XML in FGDC \n", + " meta_output = os.path.join(working_dir, \n", + " f\"./{one_feature_class}_fgdc.xml\")\n", + " \n", + " TRANSLATOR = \"FGDC_CSDGM\" \n", + "\n", + " 
source_metadata.exportMetadata(\n", + " outputPath = meta_output, \n", + " metadata_export_option = TRANSLATOR\n", + " )\n", + " print(f\"Exported FGDC XML for {one_feature_class}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Do field data dictionary updates in Jupyter Hub\n", + "### Use shapefile and write it to a file gdb layer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Clean up last run (if applicable)\n", + "for f in in_features:\n", + " feature_path = f\"{working_dir}\\{f}.xml\"\n", + " if os.path.exists(feature_path):\n", + " os.remove(feature_path)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def shp_to_feature_class(file_name: str):\n", + " \"\"\"\n", + " From shapefile (directory of files), unpack those\n", + " and write it to our staging gdb as a feature class.\n", + " \"\"\"\n", + " # construct the filename, which is takes form of routes_assembled/routes_assembled.shp\n", + " shp_file_name = f\"{os.path.join(file_name, f'{file_name}.shp')}\"\n", + " \n", + " this_feature_class = os.path.join(staging_location, file_name)\n", + " \n", + " if arcpy.Exists(this_feature_class): \n", + " arcpy.management.Delete(this_feature_class)\n", + "\n", + " # Execute FeatureClassToGeodatabase\n", + " arcpy.FeatureClassToGeodatabase_conversion(\n", + " shp_file_name, \n", + " staging_location\n", + " )\n", + " \n", + " # Print field names, just in case it needs renaming\n", + " # get a list of fields for each feature class\n", + " field_list = arcpy.ListFields(this_feature_class) \n", + " \n", + " print(this_feature_class)\n", + " for field in field_list: \n", + " print(field.name)\n", + " \n", + " return\n", + "\n", + "\n", + "for f in in_features:\n", + " shp_to_feature_class(f)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + 
"metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def rename_columns_with_dict(this_feature_class, rename_dict: dict):\n", + " \"\"\"\n", + " Get a list of fields for each feature class and use a dict to rename.\n", + " To change field names, must use AlterField_management, \n", + " because changing it in XML won't carry through when you sync\n", + " \"\"\"\n", + " field_list = arcpy.ListFields(this_feature_class) \n", + "\n", + " for field in field_list: \n", + " if field.name in rename_dict: \n", + " arcpy.AlterField_management(\n", + " this_feature_class, \n", + " field.name, \n", + " rename_dict[field.name], # new_field_name\n", + " rename_dict[field.name] # new_field_alias\n", + " ) \n", + " return" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "def update_feature_class_with_json(one_feature_class, meta_json_dict: dict):\n", + " \"\"\"\n", + " Update a single feature class.\n", + " Rename columns, apply FGDC metadata fields \n", + " template, and update metadata class attributes\n", + " that can be accessed through the arcpy.metadata class.\n", + " \"\"\"\n", + " this_feature_class = feature_class_in_gdb_path(\n", + " staging_location, \n", + " one_feature_class\n", + " )\n", + " \n", + " subset_meta_dict = meta_json_dict[one_feature_class]\n", + " \n", + " if \"rename_cols\" in subset_meta_dict.keys(): \n", + " rename_dict = subset_meta_dict[\"rename_cols\"]\n", + "\n", + " rename_columns_with_dict(this_feature_class, rename_dict)\n", + " \n", + " # Check that renaming is done\n", + " print(this_feature_class)\n", + " check_fields = arcpy.ListFields(this_feature_class)\n", + " for field in check_fields:\n", + " print(field.name)\n", + " \n", + " # Sync with FGDC metadata \n", + " # (this is on the one_feature_class, which sits outside of staging/)\n", + " #import_fgdc_metadata_and_sync(one_feature_class)\n", + " \n", + " # Now update the rest of the 
metadata elements\n", + " update_metadata_class(this_feature_class, subset_meta_dict)\n", + "\n", + " return\n", + "\n", + " \n", + "for f in in_features:\n", + " update_feature_class_with_json(f, meta_dict)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "'''\n", + "# if there are updates to data_dictionary.yml, this needs to be run\n", + "# so fields reflect new definitions.\n", + "for f in in_features:\n", + " export_fgdc_metadata(f)\n", + "'''" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for f in in_features:\n", + " this_feature_class = feature_class_in_gdb_path(staging_location, f)\n", + "\n", + " # Original metadata\n", + " # Migrating to Pro: https://pro.arcgis.com/en/pro-app/latest/arcpy/metadata/migrating-from-arcmap-to-arcgis-pro.htm\n", + "\n", + " source_metadata = md.Metadata(this_feature_class)\n", + " # Export metadata XML \n", + " meta_output = os.path.join(working_dir, f\"{f}.xml\")\n", + " \n", + " # In ArcGIS Pro, instead of FGDC for Desktop, use ISO 19139 GML 3.2\n", + " # https://sv03tmcpo.ct.dot.ca.gov/portal/apps/sites/#/geep/pages/open-data-request\n", + " TRANSLATOR = \"ISO19139_GML32\" \n", + " \n", + " source_metadata.exportMetadata(\n", + " outputPath = meta_output, \n", + " metadata_export_option = TRANSLATOR\n", + " )\n", + " \n", + " print(f\"successful export: {f}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Update XML in JupyterHub\n", + "\n", + "Run `python metadata_update_pro.py`\n", + "\n", + "### Import FGDC metadata for each dataset manually\n", + "The button to Metadata > Import > type of metadata set to FGDC does something different than the `metadata.importMetadata` feature, which doesn't do it. 
Manually doing the import for the fgdb metadata works for each dataset only.\n", + "\n", + "Do this FGDC metadata first to get the field descriptions populated. If we do this second, certain items in the metadata will get overwritten and set to blank.\n", + "\n", + "Somewhere once FGDC applied first, it erases the tags we included. Sad.\n", + "\n", + "### With new XML, finish up workflow" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Write layers to open_data gdb" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "# Write layers to open_data (with the overwritten and updated XML already)\n", + "def write_feature_class_to_open_data(\n", + " one_feature_class,\n", + " staging_gdb = staging_location, \n", + " output_gdb = out_location, \n", + "):\n", + " \"\"\"\n", + " Move the feature class from the staging gdb to the output gdb.\n", + " Delete the feature class in the output gdb because\n", + " we don't want _1 appended to the end\n", + " \"\"\"\n", + " staging_feature_class = feature_class_in_gdb_path(\n", + " staging_gdb, \n", + " one_feature_class\n", + " )\n", + " out_feature_class = feature_class_in_gdb_path(\n", + " output_gdb, \n", + " one_feature_class\n", + " )\n", + " \n", + " if arcpy.Exists(out_feature_class): \n", + " arcpy.management.Delete(out_feature_class)\n", + "\n", + " # Copy over the feature class from staging.gdb to open_data.gdb\n", + " arcpy.conversion.FeatureClassToFeatureClass(\n", + " staging_feature_class, \n", + " output_gdb, \n", + " one_feature_class\n", + " )\n", + " \n", + " arcpy.conversion.FeatureClassToFeatureClass(\n", + " staging_feature_class, \n", + " output_gdb, \n", + " one_feature_class\n", + " )\n", + " \n", + " return\n", + " \n", + "\n", + "for f in in_features:\n", + " write_feature_class_to_open_data(f)\n", + " print(f\"in open_data.gdb: {f}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, 
+ "source": [ + "### Exit and restart ArcPro to clear locks on layers in overwriting\n", + "\n", + "If we don't exit, the layer will be locked because it shows we're already using it (staging to open_data), and it will prevent writing from open_data to the enterprise gdb.\n", + "\n", + "License Select must be set to `Advanced` for this to work" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [ + "ENTERPRISE_DATABASE = \"Database Connections/HQrail(edit)@sv03tmcsqlprd1.sde\"" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "for f in in_features:\n", + " out_feature_class = feature_class_in_gdb_path(out_location, f)\n", + " \n", + " arcpy.FeatureClassToFeatureClass_conversion(\n", + " in_features = out_feature_class,\n", + " out_path = ENTERPRISE_DATABASE,\n", + " out_name = f\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3 (ipykernel)", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.13" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/open_data/arcgis_pro_script.py b/open_data/arcgis_pro_script.py index 1bf13d3db..b5745d9ea 100644 --- a/open_data/arcgis_pro_script.py +++ b/open_data/arcgis_pro_script.py @@ -213,6 +213,7 @@ def update_feature_class_with_json(one_feature_class, meta_json_dict: dict): ### (4) UPDATE XML METADATA SEPARATELY IN PYTHON OUTSIDE OF ARCGIS IN JUPYTERHUB +# Run `python metadata_update_pro.py` ## Import FGDC metadata for each dataset manually # The button to Metadata > 
Import > type of metadata set to FGDC does something different than the `metadata.importMetadata` feature, which doesn't do it. Manually doing the import for the fgdb metadata works for each dataset only. diff --git a/open_data/metadata.json b/open_data/metadata.json index 6a455e678..a62a99df8 100644 --- a/open_data/metadata.json +++ b/open_data/metadata.json @@ -1 +1 @@ -{"ca_hq_transit_areas": {"dataset_name": "ca_hq_transit_areas", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. 
However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Areas/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_hq_transit_stops": {"dataset_name": "ca_hq_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated stops along High Quality Transit Corridors, plus major transit stops for bus rapid transit, ferry, rail modes as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": 
"https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. 
If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. 
If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Stops/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_transit_routes": {"dataset_name": "ca_transit_routes", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates aggregated stop times and route information aggregated for each stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. 
While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"caltrans_district": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "agency_name_primary": "agency_primary", "agency_name_secondary": "agency_secondary", "route_name_used": "route_name", "route_types_served": "routetypes", "n_hours_in": "n_hours_in_service", "route_ids_": "route_ids_served"}}, "ca_transit_stops": {"dataset_name": "ca_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates aggregated stop times and route information aggregated for each stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. 
While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Stops/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"caltrans_district": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "agency_name_primary": "agency_primary", "agency_name_secondary": "agency_secondary", "route_name_used": "route_name", "route_types_served": "routetypes", "n_hours_in": "n_hours_in_service", "route_ids_": "route_ids_served"}}, "speeds_by_stop_segments": {"dataset_name": "speeds_by_stop_segments", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average all-day, peak, and offpeak transit speeds by segments for all CA operators that provide GTFS real-time vehicle positions data.", "description": "All day and peak transit 20th, 50th, and 80th percentile speeds on stop segments estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku / Eric Dasmalchi", "contact_email": "tiffany.ku@dot.ca.gov / eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions to GTFS scheduled trips, shapes, stops, and stop times tables. GTFS shapes provides the route alignment path. Multiple trips may share the same shape, with a route typically associated with multiple shapes. Shapes are cut into segments at stop positions (stop_id-stop_sequence combination). A `stop segment` refers to the portion of shapes between the prior stop and the current stop. Vehicle positions are spatially joined to 35 meter buffered segments. Within each segment-trip, the first and last vehicle position observed are used to calculate the speed. Since multiple trips may occur over a segment each day, the multiple trip speeds provide a distribution. From this distribution, the 20th percentile, 50th percentile (median), and 80th percentile speeds are calculated. 
For all day speed metrics, all trips are used. For peak speed metrics, only trips with start times between 7 - 9:59 AM and 4 - 7:59 PM are used to find the 20th, 50th, and 80th percentile metrics. Data processing notes: (a) GTFS RT trips whose vehicle position timestamps span 10 minutes or less are dropped. Incomplete data would lead to unreliable estimates of speed at the granularity we need. (b) Segment-trip speeds of over 70 mph are excluded. These are erroneously calculated as transit does not typically reach those speeds. (c) Other missing or erroneous calculations, either arising from only one vehicle position found in a segment (change in time or change in distance cannot be calculated).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Stop_Segments/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}, "speeds_by_route_time_of_day": {"dataset_name": "speeds_by_route_time_of_day", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average transit speeds by route-direction estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "description": "Provide average transit speeds, number of trips by route-direction.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": 
"https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). 
The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Route_Time_of_Day/FeatureServer", "revision_date": "2024-09-18", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}} \ No newline at end of file +{"ca_hq_transit_areas": {"dataset_name": "ca_hq_transit_areas", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": 
"transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. 
To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` column defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. 
If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Areas/FeatureServer", "revision_date": "2024-10-16", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_hq_transit_stops": {"dataset_name": "ca_hq_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated stops along High Quality Transit Corridors, plus major transit stops for bus rapid transit, ferry, rail modes as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. 
While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. 
We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` column defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Stops/FeatureServer", "revision_date": "2024-10-16", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_transit_routes": {"dataset_name": "ca_transit_routes", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates aggregated stop times and route information aggregated for each stop.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer", "revision_date": "2024-10-16", "rename_cols": {"caltrans_district": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "agency_name_primary": "agency_primary", "agency_name_secondary": "agency_secondary", "route_name_used": "route_name", "route_types_served": "routetypes", "n_hours_in": "n_hours_in_service", "route_ids_": "route_ids_served"}}, "ca_transit_stops": {"dataset_name": "ca_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. Transit stops associates aggregated stop times and route information aggregated for each stop.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. 
While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Stops/FeatureServer", "revision_date": "2024-10-16", "rename_cols": {"caltrans_district": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "agency_name_primary": "agency_primary", "agency_name_secondary": "agency_secondary", "route_name_used": "route_name", "route_types_served": "routetypes", "n_hours_in": "n_hours_in_service", "route_ids_": "route_ids_served"}}, "speeds_by_stop_segments": {"dataset_name": "speeds_by_stop_segments", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average all-day, peak, and offpeak transit speeds by segments for all CA operators that provide GTFS real-time vehicle positions data.", "description": "All day and peak transit 20th, 50th, and 80th percentile speeds on stop segments estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku / Eric Dasmalchi", "contact_email": "tiffany.ku@dot.ca.gov / eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions to GTFS scheduled trips, shapes, stops, and stop times tables. GTFS shapes provides the route alignment path. Multiple trips may share the same shape, with a route typically associated with multiple shapes. Shapes are cut into segments at stop positions (stop_id-stop_sequence combination). A `stop segment` refers to the portion of shapes between the prior stop and the current stop. Vehicle positions are spatially joined to 35 meter buffered segments. Within each segment-trip, the first and last vehicle position observed are used to calculate the speed. Since multiple trips may occur over a segment each day, the multiple trip speeds provide a distribution. From this distribution, the 20th percentile, 50th percentile (median), and 80th percentile speeds are calculated. 
For all day speed metrics, all trips are used. For peak speed metrics, only trips with start times between 7 - 9:59 AM and 4 - 7:59 PM are used to find the 20th, 50th, and 80th percentile metrics. Data processing notes: (a) GTFS RT trips whose vehicle position timestamps span 10 minutes or less are dropped. Incomplete data would lead to unreliable estimates of speed at the granularity we need. (b) Segment-trip speeds of over 70 mph are excluded. These are erroneously calculated as transit does not typically reach those speeds. (c) Other missing or erroneous calculations, either arising from only one vehicle position found in a segment (change in time or change in distance cannot be calculated).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Stop_Segments/FeatureServer", "revision_date": "2024-10-16", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}, "speeds_by_route_time_of_day": {"dataset_name": "speeds_by_route_time_of_day", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average transit speeds by route-direction estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "description": "Provide average transit speeds, number of trips by route-direction.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": 
"https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). 
The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Route_Time_of_Day/FeatureServer", "revision_date": "2024-10-16", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency", "stop_pair_": "stop_pair_name"}}} \ No newline at end of file diff --git a/open_data/update_data_dict.py b/open_data/update_data_dict.py index f5b30702c..75da42d6f 100644 --- a/open_data/update_data_dict.py +++ b/open_data/update_data_dict.py @@ -1,3 +1,13 @@ +""" +Go through each dataset we publish from catalog.yml. +Compare it to data_dictionary.yml to double +check that all the columns have an entry. + +This script is useful when we make adjustments +to datasets we want to publish, and we +need to add a corresponding entry to data_dictionary.yml, +which is used to update column definitions in ESRI. 
+""" import geopandas as gpd import intake import sys @@ -22,6 +32,7 @@ def unpack_list_of_tables_as_dict(list_of_dict: list) -> dict: return dict_of_tables + def new_columns_for_data_dict( open_data_catalog: Union[str, Path] = Path("catalog.yml"), data_dict_file: Union[str, Path] = Path("data_dictionary.yml") diff --git a/open_data/update_vars.py b/open_data/update_vars.py index 15ed718ec..5569d44f4 100644 --- a/open_data/update_vars.py +++ b/open_data/update_vars.py @@ -1,7 +1,7 @@ from pathlib import Path from shared_utils import catalog_utils, rt_dates -analysis_date = rt_dates.DATES["sep2024"] +analysis_date = rt_dates.DATES["oct2024"] GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") @@ -19,8 +19,8 @@ DATA_DICT_YML = Path("data_dictionary.yml") RUN_ME = [ - "ca_hq_transit_areas", - "ca_hq_transit_stops", + #"ca_hq_transit_areas", + #"ca_hq_transit_stops", "ca_transit_routes", "ca_transit_stops", "speeds_by_stop_segments", diff --git a/open_data/xml/ca_transit_routes.xml b/open_data/xml/ca_transit_routes.xml index 6c83f0622..c9abad723 100644 --- a/open_data/xml/ca_transit_routes.xml +++ b/open_data/xml/ca_transit_routes.xml @@ -85,7 +85,7 @@ - 2024-09-18 + 2024-10-16 diff --git a/open_data/xml/ca_transit_stops.xml b/open_data/xml/ca_transit_stops.xml index b383f144d..1a5965092 100644 --- a/open_data/xml/ca_transit_stops.xml +++ b/open_data/xml/ca_transit_stops.xml @@ -85,7 +85,7 @@ - 2024-09-18 + 2024-10-16 diff --git a/open_data/xml/speeds_by_route_time_of_day.xml b/open_data/xml/speeds_by_route_time_of_day.xml index d0ea6d1f1..7a4b73f6f 100644 --- a/open_data/xml/speeds_by_route_time_of_day.xml +++ b/open_data/xml/speeds_by_route_time_of_day.xml @@ -85,7 +85,7 @@ - 2024-09-18 + 2024-10-16 diff --git a/open_data/xml/speeds_by_stop_segments.xml b/open_data/xml/speeds_by_stop_segments.xml index 2b1b8daa2..4d405e9b5 100644 --- a/open_data/xml/speeds_by_stop_segments.xml +++ b/open_data/xml/speeds_by_stop_segments.xml @@ -85,7 +85,7 @@ - 
2024-09-18 + 2024-10-16 diff --git a/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log b/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log index 637e03c7d..882e89a13 100644 --- a/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log +++ b/rt_scheduled_v_ran/logs/rt_v_scheduled_route_metrics.log @@ -70,3 +70,6 @@ 2024-08-05 10:49:43.399 | INFO | __main__:route_metrics:84 - route aggregation 2024-07-17: 0:00:02.982204 2024-08-15 13:24:21.737 | INFO | __main__:route_metrics:84 - route aggregation 2024-08-14: 0:00:02.641057 2024-09-19 13:19:02.357 | INFO | __main__:route_metrics:84 - route aggregation 2024-09-18: 0:00:02.698805 +2024-10-17 19:46:55.159 | INFO | __main__:route_metrics:85 - route aggregation 2024-10-16: 0:00:03.161050 +2024-10-18 11:06:32.782 | INFO | __main__:route_metrics:85 - route aggregation 2024-10-14: 0:00:03.113947 +2024-10-18 11:06:35.601 | INFO | __main__:route_metrics:85 - route aggregation 2024-10-15: 0:00:02.791556 diff --git a/rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log b/rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log index 4cc78f5a0..ec6c4a04a 100644 --- a/rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log +++ b/rt_scheduled_v_ran/logs/rt_v_scheduled_trip_metrics.log @@ -453,3 +453,12 @@ 2024-09-19 12:52:30.501 | INFO | __main__:rt_schedule_trip_metrics:280 - tabular trip metrics 2024-09-18: 0:02:49.593356 2024-09-19 13:16:44.431 | INFO | __main__:rt_schedule_trip_metrics:285 - spatial trip metrics 2024-09-18: 0:24:13.930638 2024-09-19 13:18:42.287 | INFO | __main__:rt_schedule_trip_metrics:333 - Total run time for metrics on 2024-09-18: 0:29:01.379486 +2024-10-17 19:21:04.336 | INFO | __main__:rt_schedule_trip_metrics:280 - tabular trip metrics 2024-10-16: 0:03:32.986855 +2024-10-17 19:45:05.447 | INFO | __main__:rt_schedule_trip_metrics:285 - spatial trip metrics 2024-10-16: 0:24:01.110846 +2024-10-17 19:46:33.468 | INFO | __main__:rt_schedule_trip_metrics:333 - Total run time for metrics on 
2024-10-16: 0:29:02.118442 +2024-10-18 10:14:34.429 | INFO | __main__:rt_schedule_trip_metrics:280 - tabular trip metrics 2024-10-14: 0:03:14.395025 +2024-10-18 10:38:13.319 | INFO | __main__:rt_schedule_trip_metrics:285 - spatial trip metrics 2024-10-14: 0:23:38.890604 +2024-10-18 10:39:31.248 | INFO | __main__:rt_schedule_trip_metrics:333 - Total run time for metrics on 2024-10-14: 0:28:11.214222 +2024-10-18 10:42:14.181 | INFO | __main__:rt_schedule_trip_metrics:280 - tabular trip metrics 2024-10-15: 0:02:42.921011 +2024-10-18 11:04:40.651 | INFO | __main__:rt_schedule_trip_metrics:285 - spatial trip metrics 2024-10-15: 0:22:26.469181 +2024-10-18 11:05:58.946 | INFO | __main__:rt_schedule_trip_metrics:333 - Total run time for metrics on 2024-10-15: 0:26:27.685521 diff --git a/rt_scheduled_v_ran/scripts/update_vars.py b/rt_scheduled_v_ran/scripts/update_vars.py index 1f8daf291..f7173653d 100644 --- a/rt_scheduled_v_ran/scripts/update_vars.py +++ b/rt_scheduled_v_ran/scripts/update_vars.py @@ -4,9 +4,10 @@ oct2023_week = rt_dates.get_week("oct2023", exclude_wed=True) apr2023_week = rt_dates.get_week("apr2023", exclude_wed=True) apr2024_week = rt_dates.get_week("apr2024", exclude_wed=True) +oct2024_week = rt_dates.get_week("oct2024", exclude_wed=True) -# analysis_date_list = [rt_dates.DATES["sep2024"]] -analysis_date_list = rt_dates.y2024_dates + rt_dates.y2023_dates +#analysis_date_list = [rt_dates.DATES["oct2024"]] +analysis_date_list = [rt_dates.DATES[f"oct2024{i}"] for i in ["a", "b"]] GTFS_DATA_DICT = catalog_utils.get_catalog("gtfs_analytics_data") diff --git a/rt_segment_speeds/logs/avg_speeds.log b/rt_segment_speeds/logs/avg_speeds.log index 816292e0c..589b582aa 100644 --- a/rt_segment_speeds/logs/avg_speeds.log +++ b/rt_segment_speeds/logs/avg_speeds.log @@ -455,3 +455,27 @@ 2024-09-19 12:42:38.061 | INFO | average_segment_speeds:single_day_segment_averages:173 - shape seg avg 0:05:30.126380 2024-09-19 12:46:50.506 | INFO | 
average_segment_speeds:single_day_segment_averages:189 - route dir seg avg 0:04:12.445389 2024-09-19 12:46:50.507 | INFO | average_segment_speeds:single_day_segment_averages:190 - single day segment 2024-09-18 execution time: 0:09:42.571769 +2024-10-17 18:17:38.147 | INFO | __main__:single_day_summary_averages:90 - trip avg 0:00:18.002158 +2024-10-17 18:17:51.697 | INFO | __main__:single_day_summary_averages:132 - route dir avg: 0:00:13.550000 +2024-10-17 18:17:51.698 | INFO | __main__:single_day_summary_averages:133 - single day summary speed 2024-10-16 execution time: 0:00:31.552158 +2024-10-17 18:36:28.002 | INFO | average_segment_speeds:single_day_segment_averages:177 - shape seg avg 0:06:04.775175 +2024-10-17 18:40:42.231 | INFO | average_segment_speeds:single_day_segment_averages:193 - route dir seg avg 0:04:14.229086 +2024-10-17 18:40:42.232 | INFO | average_segment_speeds:single_day_segment_averages:194 - single day segment 2024-10-16 execution time: 0:10:19.004261 +2024-10-18 00:34:00.973 | INFO | __main__:single_day_segment_averages:177 - shape seg avg 0:04:53.730922 +2024-10-18 00:37:30.933 | INFO | __main__:single_day_segment_averages:193 - route dir seg avg 0:03:29.959724 +2024-10-18 00:37:30.935 | INFO | __main__:single_day_segment_averages:194 - single day segment 2024-10-14 execution time: 0:08:23.690646 +2024-10-18 00:42:04.604 | INFO | __main__:single_day_segment_averages:177 - shape seg avg 0:04:33.336592 +2024-10-18 00:45:18.448 | INFO | __main__:single_day_segment_averages:193 - route dir seg avg 0:03:13.843336 +2024-10-18 00:45:18.448 | INFO | __main__:single_day_segment_averages:194 - single day segment 2024-10-15 execution time: 0:07:47.179928 +2024-10-18 09:24:57.077 | INFO | __main__:single_day_summary_averages:90 - trip avg 0:00:17.634204 +2024-10-18 09:25:10.181 | INFO | __main__:single_day_summary_averages:132 - route dir avg: 0:00:13.104249 +2024-10-18 09:25:10.183 | INFO | __main__:single_day_summary_averages:133 - single day summary 
speed 2024-10-14 execution time: 0:00:30.738453 +2024-10-18 09:25:25.944 | INFO | __main__:single_day_summary_averages:90 - trip avg 0:00:15.625256 +2024-10-18 09:25:38.786 | INFO | __main__:single_day_summary_averages:132 - route dir avg: 0:00:12.841694 +2024-10-18 09:25:38.787 | INFO | __main__:single_day_summary_averages:133 - single day summary speed 2024-10-15 execution time: 0:00:28.466950 +2024-10-18 09:53:22.768 | INFO | average_segment_speeds:single_day_segment_averages:177 - shape seg avg 0:06:01.586897 +2024-10-18 09:57:18.027 | INFO | average_segment_speeds:single_day_segment_averages:193 - route dir seg avg 0:03:55.259299 +2024-10-18 09:57:18.027 | INFO | average_segment_speeds:single_day_segment_averages:194 - single day segment 2024-10-14 execution time: 0:09:56.846196 +2024-10-18 10:02:43.872 | INFO | average_segment_speeds:single_day_segment_averages:177 - shape seg avg 0:05:25.476441 +2024-10-18 10:06:42.688 | INFO | average_segment_speeds:single_day_segment_averages:193 - route dir seg avg 0:03:58.815162 +2024-10-18 10:06:42.688 | INFO | average_segment_speeds:single_day_segment_averages:194 - single day segment 2024-10-15 execution time: 0:09:24.291603 diff --git a/rt_segment_speeds/logs/cut_stop_segments.log b/rt_segment_speeds/logs/cut_stop_segments.log index 39bf1068a..52742d934 100644 --- a/rt_segment_speeds/logs/cut_stop_segments.log +++ b/rt_segment_speeds/logs/cut_stop_segments.log @@ -47,3 +47,7 @@ 2024-08-15 11:01:37.861 | INFO | __main__::244 - speedmap segments and proxy_stop_times 2024-08-14: 0:04:20.718384 2024-09-19 10:45:10.417 | INFO | __main__::155 - cut segments 2024-09-18: 0:22:12.922031 2024-09-19 10:51:18.211 | INFO | __main__::244 - speedmap segments and proxy_stop_times 2024-09-18: 0:04:36.568172 +2024-10-17 22:33:30.209 | INFO | __main__::155 - cut segments 2024-10-14: 0:22:54.280484 +2024-10-17 22:56:42.706 | INFO | __main__::155 - cut segments 2024-10-15: 0:23:12.453821 +2024-10-17 23:03:58.736 | INFO | __main__::244 - 
speedmap segments and proxy_stop_times 2024-10-14: 0:04:57.164201 +2024-10-17 23:09:24.490 | INFO | __main__::244 - speedmap segments and proxy_stop_times 2024-10-15: 0:05:10.800709 diff --git a/rt_segment_speeds/logs/interpolate_stop_arrival.log b/rt_segment_speeds/logs/interpolate_stop_arrival.log index 29cc7fc4a..8726e8661 100644 --- a/rt_segment_speeds/logs/interpolate_stop_arrival.log +++ b/rt_segment_speeds/logs/interpolate_stop_arrival.log @@ -101,3 +101,11 @@ 2024-08-15 12:42:32.459 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for speedmap_segments 2024-08-14: 2024-08-14: 0:02:27.666741 2024-09-19 11:34:46.012 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for stop_segments 2024-09-18: 2024-09-18: 0:15:34.067479 2024-09-19 12:22:50.153 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for rt_stop_times 2024-09-18: 2024-09-18: 0:15:01.401473 +2024-10-17 18:14:44.595 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for rt_stop_times 2024-10-16: 2024-10-16: 0:15:28.990321 +2024-10-17 18:27:58.463 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for speedmap_segments 2024-10-16: 2024-10-16: 0:02:51.047734 +2024-10-18 00:10:21.879 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for stop_segments 2024-10-14: 2024-10-14: 0:15:09.612106 +2024-10-18 00:25:38.170 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for stop_segments 2024-10-15: 2024-10-15: 0:15:16.185162 +2024-10-18 09:05:21.436 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for rt_stop_times 2024-10-14: 2024-10-14: 0:14:33.877187 +2024-10-18 09:19:52.143 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for rt_stop_times 2024-10-15: 2024-10-15: 0:14:30.651162 
+2024-10-18 09:40:29.727 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for speedmap_segments 2024-10-14: 2024-10-14: 0:02:44.046820 +2024-10-18 09:43:15.358 | INFO | interpolate_stop_arrival:interpolate_stop_arrivals:279 - interpolate arrivals for speedmap_segments 2024-10-15: 2024-10-15: 0:02:45.584039 diff --git a/rt_segment_speeds/logs/nearest_vp.log b/rt_segment_speeds/logs/nearest_vp.log index bcb9357da..829733555 100644 --- a/rt_segment_speeds/logs/nearest_vp.log +++ b/rt_segment_speeds/logs/nearest_vp.log @@ -207,3 +207,18 @@ 2024-09-19 12:07:48.692 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for rt_stop_times 2024-09-18: 0:10:11.973530 2024-09-19 12:28:39.454 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for speedmap_segments 2024-09-18: 0:02:33.742427 2024-09-19 12:32:09.310 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for speedmap_segments 2024-09-18: 0:03:29.417591 +2024-10-17 17:59:15.541 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for rt_stop_times 2024-10-16: 0:10:43.659403 +2024-10-17 18:21:01.949 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for speedmap_segments 2024-10-16: 0:02:51.936355 +2024-10-17 18:25:07.376 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for speedmap_segments 2024-10-16: 0:04:04.899447 +2024-10-17 23:22:37.629 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for stop_segments 2024-10-14: 0:12:54.173754 +2024-10-17 23:35:33.876 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for stop_segments 2024-10-15: 0:12:53.836177 +2024-10-17 23:45:23.583 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for stop_segments 2024-10-14: 0:09:47.459953 +2024-10-17 23:55:12.205 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for stop_segments 2024-10-15: 
0:09:48.537422 +2024-10-18 00:58:35.591 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for rt_stop_times 2024-10-14: 0:12:59.351927 +2024-10-18 01:11:34.610 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for rt_stop_times 2024-10-15: 0:12:56.742090 +2024-10-18 01:22:14.543 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for rt_stop_times 2024-10-14: 0:10:37.479636 +2024-10-18 08:50:47.496 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for rt_stop_times 2024-10-15: 0:10:07.060145 +2024-10-18 09:28:31.780 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for speedmap_segments 2024-10-14: 0:02:34.578154 +2024-10-18 09:30:50.351 | INFO | nearest_vp_to_stop:nearest_neighbor_for_stop:178 - nearest neighbor for speedmap_segments 2024-10-15: 0:02:18.112697 +2024-10-18 09:34:19.355 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for speedmap_segments 2024-10-14: 0:03:28.557971 +2024-10-18 09:37:45.651 | INFO | vp_around_stops:filter_to_nearest_two_vp:247 - nearest 2 vp for speedmap_segments 2024-10-15: 0:03:26.270136 diff --git a/rt_segment_speeds/logs/speeds_by_segment_trip.log b/rt_segment_speeds/logs/speeds_by_segment_trip.log index 18527e3c9..6146cca32 100644 --- a/rt_segment_speeds/logs/speeds_by_segment_trip.log +++ b/rt_segment_speeds/logs/speeds_by_segment_trip.log @@ -113,3 +113,11 @@ 2024-09-19 11:36:29.235 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for stop_segments 2024-09-18: 0:01:43.166792 2024-09-19 12:25:01.693 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for rt_stop_times 2024-09-18: 0:02:11.499303 2024-09-19 12:36:50.740 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for speedmap_segments 2024-09-18: 0:01:46.975907 +2024-10-17 18:17:00.860 | INFO | 
stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for rt_stop_times 2024-10-16: 0:02:16.190608 +2024-10-17 18:30:04.301 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for speedmap_segments 2024-10-16: 0:01:55.327914 +2024-10-18 00:27:16.221 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for stop_segments 2024-10-14: 0:01:37.987755 +2024-10-18 00:28:47.254 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for stop_segments 2024-10-15: 0:01:31.008540 +2024-10-18 09:22:02.431 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for rt_stop_times 2024-10-14: 0:02:10.249128 +2024-10-18 09:24:16.361 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for rt_stop_times 2024-10-15: 0:02:13.909495 +2024-10-18 09:45:15.291 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for speedmap_segments 2024-10-14: 0:01:41.138216 +2024-10-18 09:47:03.928 | INFO | stop_arrivals_to_speed:calculate_speed_from_stop_arrivals:176 - speeds by segment for speedmap_segments 2024-10-15: 0:01:48.632317 diff --git a/rt_segment_speeds/segment_speed_utils/project_vars.py b/rt_segment_speeds/segment_speed_utils/project_vars.py index 3c9f7a259..e9419ee8d 100644 --- a/rt_segment_speeds/segment_speed_utils/project_vars.py +++ b/rt_segment_speeds/segment_speed_utils/project_vars.py @@ -11,24 +11,26 @@ SHARED_GCS = GTFS_DATA_DICT.gcs_paths.SHARED_GCS PUBLIC_GCS = GTFS_DATA_DICT.gcs_paths.PUBLIC_GCS -analysis_date = rt_dates.DATES["sep2024"] +analysis_date = rt_dates.DATES["oct2024"] oct2023_week = rt_dates.get_week("oct2023", exclude_wed=True) apr2023_week = rt_dates.get_week("apr2023", exclude_wed=True) apr2024_week = rt_dates.get_week("apr2024", exclude_wed=True) +oct2024_week = rt_dates.get_week("oct2024", exclude_wed=True) 
all_dates = ( rt_dates.y2024_dates + rt_dates.y2023_dates + - apr2024_week + oct2023_week + apr2023_week + oct2024_week + apr2024_week + oct2023_week + apr2023_week ) weeks_available = [ - rt_dates.apr2024_week, rt_dates.oct2023_week, - rt_dates.apr2023_week + rt_dates.oct2024_week, rt_dates.apr2024_week, + rt_dates.oct2023_week, rt_dates.apr2023_week, ] -analysis_date_list = [analysis_date] +#analysis_date_list = [analysis_date] +analysis_date_list = [rt_dates.DATES[f"oct2024{i}"] for i in ["a", "b"]] PROJECT_CRS = "EPSG:3310"