From 47ceef7172e0aa545b4851385377eb870b2d5fe0 Mon Sep 17 00:00:00 2001 From: tiffanychu90 Date: Tue, 27 Feb 2024 19:27:54 +0000 Subject: [PATCH] refactor open data publishing, add publish_utils --- open_data/README.md | 3 + open_data/check_exported_data.ipynb | 31 ++++++++++ open_data/gcs_to_esri.py | 53 ++-------------- open_data/metadata.json | 2 +- open_data/open_data.py | 10 +-- open_data/publish_utils.py | 61 +++++++++++++++++++ open_data/supplement_meta.py | 30 +-------- open_data/update_data_dict.py | 16 ++++- open_data/update_vars.py | 11 +++- open_data/xml/ca_hq_transit_areas.xml | 4 +- open_data/xml/ca_hq_transit_stops.xml | 4 +- open_data/xml/ca_transit_routes.xml | 4 +- open_data/xml/ca_transit_stops.xml | 4 +- open_data/xml/speeds_by_route_time_of_day.xml | 4 +- open_data/xml/speeds_by_stop_segments.xml | 4 +- 15 files changed, 141 insertions(+), 100 deletions(-) create mode 100644 open_data/publish_utils.py diff --git a/open_data/README.md b/open_data/README.md index 50d9edcda..2ef89205f 100644 --- a/open_data/README.md +++ b/open_data/README.md @@ -44,6 +44,9 @@ Traffic Ops had a request for all transit routes and transit stops to be publish ### Metadata * [Metadata](./metadata.yml) * [Data dictionary](./data_dictionary.yml) +* [update_vars](./update_vars.py) and [publish_utils](./publish_utils.py) contain a lot of the variables that would frequently get updated in the publishing process. + * Apply standardized column names across published datasets, even if they differ from internal keys (`org_id` in favor of `gtfs_dataset_key`, `agency` in favor of `organization_name`). + * Since we do not save multiple versions of published datasets, the columns are renamed prior to exporting the geoparquet as a zipped shapefile. 
## Open Data Intake Process * Open a [ticket](https://forms.office.com/Pages/ResponsePage.aspx?id=ZAobYkAXzEONiEVA00h1VuRQZHWRcbdNm496kj4opnZUNUo1NjRNRFpIOVRBMVFFTFJDM1JKNkY0SC4u) on the Intranet to update or add new services and provide [justification](./intake_justification.md) \ No newline at end of file diff --git a/open_data/check_exported_data.ipynb b/open_data/check_exported_data.ipynb index e69a3061f..476743047 100644 --- a/open_data/check_exported_data.ipynb +++ b/open_data/check_exported_data.ipynb @@ -178,6 +178,16 @@ "print_stats(gdf)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "b9c367fe-c7c1-4ca3-8f64-ecf94f99bd99", + "metadata": {}, + "outputs": [], + "source": [ + "gdf[gdf.hqta_type==\"major_stop_brt\"].route_id.value_counts()" + ] + }, { "cell_type": "code", "execution_count": null, @@ -254,6 +264,27 @@ "print_stats(gdf)" ] }, + { + "cell_type": "code", + "execution_count": null, + "id": "46f3badd-a344-49c1-a523-761251dd8e32", + "metadata": {}, + "outputs": [], + "source": [ + "gdf.p50_mph.hist(bins=range(0, 80, 5))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "221fa13b-14d4-4eb1-b70d-9621b82720ee", + "metadata": {}, + "outputs": [], + "source": [ + "for col in [\"p20_mph\", \"p50_mph\", \"p80_mph\"]:\n", + " print(gdf[col].describe())" + ] + }, { "cell_type": "code", "execution_count": null, diff --git a/open_data/gcs_to_esri.py b/open_data/gcs_to_esri.py index 832a6ac83..c9857fa68 100644 --- a/open_data/gcs_to_esri.py +++ b/open_data/gcs_to_esri.py @@ -12,51 +12,12 @@ from loguru import logger -import open_data +import publish_utils from calitp_data_analysis import utils, geography_utils -from shared_utils import portfolio_utils -from update_vars import analysis_date +from update_vars import analysis_date, RUN_ME catalog = intake.open_catalog("./catalog.yml") -def standardize_column_names(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: - """ - Standardize how agency is referred to. 
- """ - RENAME_DICT = { - "caltrans_district": "district_name", - "organization_source_record_id": "org_id", - "organization_name": "org_name" - } - # these rename hqta datasets - # agency_name_primary, agency_name_secondary, etc - df.columns = df.columns.str.replace('agency_name', 'agency') - - df = df.rename(columns = RENAME_DICT) - df - - return df - - -def remove_internal_keys(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: - """ - Remove columns used in our internal data modeling. - Leave only natural identifiers (route_id, shape_id). - Remove shape_array_key, gtfs_dataset_key, etc. - """ - exclude_list = [ - "sec_elapsed", "meters_elapsed", - "name" #schedule_gtfs_dataset_name - ] - cols = [c for c in df.columns] - - internal_cols = [c for c in cols if "_key" in c or c in exclude_list] - - print(f"drop: {internal_cols}") - - return df.drop(columns = internal_cols) - - def print_info(gdf: gpd.GeoDataFrame): """ Double check that the metadata is entered correctly and @@ -89,13 +50,11 @@ def remove_zipped_shapefiles(): logger.add(sys.stderr, format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {message}", level="INFO") - - #datasets = list(dict(catalog).keys()) - datasets = open_data.RUN_ME - - for d in datasets : + + for d in RUN_ME : gdf = catalog[d].read().to_crs(geography_utils.WGS84) - gdf = standardize_column_names(gdf).pipe(remove_internal_keys) + gdf = publish_utils.standardize_column_names(gdf).pipe( + publish_utils.remove_internal_keys) logger.info(f"********* {d} *************") print_info(gdf) diff --git a/open_data/metadata.json b/open_data/metadata.json index c87deaf3f..319cd5d72 100644 --- a/open_data/metadata.json +++ b/open_data/metadata.json @@ -1 +1 @@ -{"ca_hq_transit_areas": {"dataset_name": "ca_hq_transit_areas", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule 
trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. 
The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Areas/FeatureServer", "revision_date": "2024-01-17", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_hq_transit_stops": {"dataset_name": "ca_hq_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated stops along High Quality Transit Corridors, plus major transit stops for bus rapid transit, ferry, rail modes as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": 
"The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. 
Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` columns defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Stops/FeatureServer", "revision_date": "2024-01-17", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_transit_routes": {"dataset_name": "ca_transit_routes", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates route information to stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer", "revision_date": "2024-01-17"}, "ca_transit_stops": {"dataset_name": "ca_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates route information to stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Stops/FeatureServer", "revision_date": "2024-01-17"}, "speeds_by_stop_segments": {"dataset_name": "speeds_by_stop_segments", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average all-day, peak, and offpeak transit speeds by segments for all CA operators that provide GTFS real-time vehicle positions data.", "description": "All day and peak transit 20th, 50th, and 80th percentile speeds on stop segments estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku / Eric Dasmalchi", "contact_email": "tiffany.ku@dot.ca.gov / eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions to GTFS scheduled trips, shapes, stops, and stop times tables. GTFS shapes provides the route alignment path. Multiple trips may share the same shape, with a route typically associated with multiple shapes. Shapes are cut into segments at stop positions (stop_id-stop_sequence combination). A `stop segment` refers to the portion of shapes between the prior stop and the current stop. Vehicle positions are spatially joined to 35 meter buffered segments. Within each segment-trip, the first and last vehicle position observed are used to calculate the speed. Since multiple trips may occur over a segment each day, the multiple trip speeds provide a distribution. From this distribution, the 20th percentile, 50th percentile (median), and 80th percentile speeds are calculated. For all day speed metrics, all trips are used. For peak speed metrics, only trips with start times between 7 - 9:59 AM and 4 - 7:59 PM are used to find the 20th, 50th, and 80th percentile metrics. Data processing notes: (a) GTFS RT trips whose vehicle position timestamps span 10 minutes or less are dropped. 
Incomplete data would lead to unreliable estimates of speed at the granularity we need. (b) Segment-trip speeds of over 70 mph are excluded. These are erroneously calculated as transit does not typically reach those speeds. (c) Other missing or erroneous calculations, either arising from only one vehicle position found in a segment (change in time or change in distance cannot be calculated).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Stop_Segments/FeatureServer", "revision_date": "2024-01-17", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name"}}, "speeds_by_route_time_of_day": {"dataset_name": "speeds_by_route_time_of_day", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average transit speeds by route-direction estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "description": "Provide average transit speeds, number of trips by route-direction.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for 
informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. 
This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Route_Time_of_Day/FeatureServer", "revision_date": "2024-01-17", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name"}}} \ No newline at end of file +{"ca_hq_transit_areas": {"dataset_name": "ca_hq_transit_areas", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated High Quality Transit Areas as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", 
"boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. 
Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` column defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. 
If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Areas/FeatureServer", "revision_date": "2024-02-14", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_hq_transit_stops": {"dataset_name": "ca_hq_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Estimated stops along High Quality Transit Corridors, plus major transit stops for bus rapid transit, ferry, rail modes as described in Public Resources Code 21155, 21064.3, 21060.2.", "description": "Use GTFS schedule trips, stop_times, shapes, and stops to estimate whether corridor segments have scheduled frequencies of 15 minutes or less.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Land Use, Transit-Oriented Development, TOD, High Quality Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/high_quality_transit_areas/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Eric Dasmalchi", "contact_email": "eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. 
While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated using a spatial process derived from General Transit Feed Specification (GTFS) schedule data. To find high-quality bus corridors, we split each corridor into 1,500 meter segments and counted frequencies at the stop within that segment with the highest number of transit trips. If that stop saw at least 4 trips per hour for at least one hour in the morning, and again for at least one hour in the afternoon, we consider that segment a high-quality bus corridor. Segments without a stop are not considered high-quality corridors. Major transit stops were identified as either the intersection of two high-quality corridors from the previous step, a rail or bus rapid transit station, or a ferry terminal with bus service. Note that the definition of `bus rapid transit` in Public Resources Code 21060.2 includes features not captured by available data sources, these features were captured manually using information from transit agency sources and imagery. 
We believe this data to be broadly accurate, and fit for purposes including overall dashboards, locating facilities in relation to high quality transit areas, and assessing community transit coverage. However, the spatial determination of high-quality transit areas from GTFS data necessarily involves some assumptions as described above. Any critical determinations of whether a specific parcel is located within a high-quality transit area should be made in conjunction with local sources, such as transit agency timetables. Notes: Null values may be present. The `hqta_details` column defines which part of the Public Resources Code definition the HQTA classification was based on. If `hqta_details` references a single operator, then `agency_secondary` and `base64_url_secondary` are null. If `hqta_details` references the same operator, then `agency_secondary` and `base64_url_secondary` are the same as `agency_primary` and `base64_url_primary`.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_HQ_Transit_Stops/FeatureServer", "revision_date": "2024-02-14", "rename_cols": {"agency_pri": "agency_primary", "agency_sec": "agency_secondary", "hqta_detai": "hqta_details", "base64_url": "base64_url_primary", "base64_u_1": "base64_url_secondary", "org_id_pri": "org_id_primary", "org_id_sec": "org_id_secondary"}}, "ca_transit_routes": {"dataset_name": "ca_transit_routes", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates route information to stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Routes/FeatureServer", "revision_date": "2024-02-14"}, "ca_transit_stops": {"dataset_name": "ca_transit_stops", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Provide all CA transit stops and routes (geospatial) from all transit operators.", "description": "Provide compiled GTFS schedule data in geospatial format. Transit routes associates route information to shapes. 
Transit stops associates route information to stops.", "public_access": "Public.", "creation_date": "2022-02-08", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, GTFS, Transit routes, Transit stops, Transit", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/open_data/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. 
The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was assembled from the General Transit Feed Specification (GTFS) schedule data. GTFS tables are text files, but these have been compiled for all operators and transformed into geospatial data, with minimal data processing. The transit routes dataset is assembled from two tables: (1) `shapes.txt`, which defines the route alignment path, and (2) `trips.txt` and `stops.txt`, for routes not found in `shapes.txt`. `shapes.txt` is an optional GTFS table with richer information than just transit stop longitude and latitude. The transit stops dataset is assembled from `stops.txt`, which contains information about the route, stop sequence, and stop longitude and latitude. References: https://gtfs.org/. https://gtfs.org/schedule/reference/#shapestxt. https://gtfs.org/schedule/reference/#stopstxt. 
https://gtfs.org/schedule/reference/#tripstxt.", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/CA_Transit_Stops/FeatureServer", "revision_date": "2024-02-14"}, "speeds_by_stop_segments": {"dataset_name": "speeds_by_stop_segments", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average all-day, peak, and offpeak transit speeds by segments for all CA operators that provide GTFS real-time vehicle positions data.", "description": "All day and peak transit 20th, 50th, and 80th percentile speeds on stop segments estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku / Eric Dasmalchi", "contact_email": "tiffany.ku@dot.ca.gov / eric.dasmalchi@dot.ca.gov", "horiz_accuracy": "4 meters", "boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. 
Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions to GTFS scheduled trips, shapes, stops, and stop times tables. GTFS shapes provides the route alignment path. Multiple trips may share the same shape, with a route typically associated with multiple shapes. Shapes are cut into segments at stop positions (stop_id-stop_sequence combination). A `stop segment` refers to the portion of shapes between the prior stop and the current stop. Vehicle positions are spatially joined to 35 meter buffered segments. Within each segment-trip, the first and last vehicle position observed are used to calculate the speed. Since multiple trips may occur over a segment each day, the multiple trip speeds provide a distribution. From this distribution, the 20th percentile, 50th percentile (median), and 80th percentile speeds are calculated. For all day speed metrics, all trips are used. For peak speed metrics, only trips with start times between 7 - 9:59 AM and 4 - 7:59 PM are used to find the 20th, 50th, and 80th percentile metrics. Data processing notes: (a) GTFS RT trips whose vehicle position timestamps span 10 minutes or less are dropped. 
Incomplete data would lead to unreliable estimates of speed at the granularity we need. (b) Segment-trip speeds of over 70 mph are excluded. These are erroneously calculated as transit does not typically reach those speeds. (c) Other missing or erroneous calculations, either arising from only one vehicle position found in a segment (change in time or change in distance cannot be calculated).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Stop_Segments/FeatureServer", "revision_date": "2024-02-14", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency"}}, "speeds_by_route_time_of_day": {"dataset_name": "speeds_by_route_time_of_day", "publish_entity": "Data & Digital Services / California Integrated Travel Project", "summary_purpose": "Average transit speeds by route-direction estimated on a single day for all CA transit operators that provide GTFS real-time vehicle positions data.", "description": "Provide average transit speeds, number of trips by route-direction.", "public_access": "Public.", "creation_date": "2023-06-14", "place": "California", "status": "completed", "frequency": "monthly", "theme_topic": "transportation", "theme_keywords": "Transportation, Transit, GTFS, GTFS RT, real time, speeds, vehicle positions ", "data_dict_type": "XML", "readme": "https://github.com/cal-itp/data-analyses/blob/main/rt_segment_speeds/README.md", "readme_desc": "This site allows you to access the code used to create this dataset and provides additional explanatory resources.", "contact_organization": "Caltrans", "contact_person": "Tiffany Ku", "contact_email": "tiffany.ku@dot.ca.gov", "horiz_accuracy": "4 meters", 
"boilerplate_desc": "The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information Use Limitation - The data are made available to the public solely for informational purposes. Information provided in the Caltrans GIS Data Library is accurate to the best of our knowledge and is subject to change on a regular basis, without notice. While Caltrans makes every effort to provide useful and accurate information, we do not warrant the information to be authoritative, complete, factual, or timely. Information is provided on an 'as is' and an 'as available' basis. The Department of Transportation is not liable to any party for any cost or damages, including any direct, indirect, special, incidental, or consequential damages, arising out of or in connection with the access or use of, or the inability to access or use, the Site or any of the Materials or Services described herein.", "boilerplate_license": "Creative Commons 4.0 Attribution.", "methodology": "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. 
For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am).", "data_dict_url": "https://gisdata.dot.ca.gov/arcgis/rest/services/CHrailroad/Speeds_By_Route_Time_of_Day/FeatureServer", "revision_date": "2024-02-14", "rename_cols": {"stop_seque": "stop_sequence", "time_of_da": "time_of_day", "time_perio": "time_period", "district_n": "district_name", "direction_": "direction_id", "common_sha": "common_shape_id", "avg_sched_": "avg_sched_trip_min", "avg_rt_tri": "avg_rt_trip_min", "caltrans_d": "district_name", "organization_source_record_id": "org_id", "organization_name": "agency"}}} \ No newline at end of file diff --git a/open_data/open_data.py b/open_data/open_data.py index bad7e3bce..329d41e91 100644 --- a/open_data/open_data.py +++ b/open_data/open_data.py @@ -4,16 +4,8 @@ from pathlib import Path import metadata_update_pro -from update_vars import XML_FOLDER, META_JSON +from update_vars import XML_FOLDER, META_JSON, RUN_ME -RUN_ME = [ - "ca_hq_transit_areas", - "ca_hq_transit_stops", - "ca_transit_routes", - "ca_transit_stops", - "speeds_by_stop_segments", - "speeds_by_route_time_of_day", -] if __name__=="__main__": assert str(Path.cwd()).endswith("open_data"), "this script must be run from open_data directory!" 
diff --git a/open_data/publish_utils.py b/open_data/publish_utils.py new file mode 100644 index 000000000..4222bde28 --- /dev/null +++ b/open_data/publish_utils.py @@ -0,0 +1,61 @@ +import geopandas as gpd +import pandas as pd + +STANDARDIZED_COLUMNS_DICT = { + "caltrans_district": "district_name", + "organization_source_record_id": "org_id", + "organization_name": "agency", + "agency_name_primary": "agency_primary", + "agency_name_secondary": "agency_secondary" +} + + +# Rename columns when shapefile truncates +RENAME_HQTA = { + "agency_pri": "agency_primary", + "agency_sec": "agency_secondary", + "hqta_detai": "hqta_details", + "base64_url": "base64_url_primary", + "base64_u_1": "base64_url_secondary", + "org_id_pri": "org_id_primary", + "org_id_sec": "org_id_secondary", +} + +RENAME_SPEED = { + "stop_seque": "stop_sequence", + "time_of_da": "time_of_day", + "time_perio": "time_period", + "district_n": "district_name", + "direction_": "direction_id", + "common_sha": "common_shape_id", + "avg_sched_": "avg_sched_trip_min", + "avg_rt_tri": "avg_rt_trip_min", + "caltrans_d": "district_name", + "organization_source_record_id": "org_id", + "organization_name": "agency" +} + +def standardize_column_names(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + """ + Standardize how agency is referred to. + """ + return df.rename(columns = STANDARDIZED_COLUMNS_DICT) + + +def remove_internal_keys(df: gpd.GeoDataFrame) -> gpd.GeoDataFrame: + """ + Remove columns used in our internal data modeling. + Leave only natural identifiers (route_id, shape_id). + Remove shape_array_key, gtfs_dataset_key, etc. 
+ """ + exclude_list = [ + "sec_elapsed", "meters_elapsed", + "name", "schedule_gtfs_dataset_key" + ] + cols = [c for c in df.columns] + + internal_cols = [c for c in cols if "_key" in c or c in exclude_list] + + print(f"drop: {internal_cols}") + + return df.drop(columns = internal_cols) \ No newline at end of file diff --git a/open_data/supplement_meta.py b/open_data/supplement_meta.py index 45ca4734c..edda0d4c2 100644 --- a/open_data/supplement_meta.py +++ b/open_data/supplement_meta.py @@ -9,7 +9,7 @@ from calitp_data_analysis import utils from update_vars import analysis_date, ESRI_BASE_URL - +from publish_utils import RENAME_HQTA, RENAME_SPEED def get_esri_url(name: str)-> str: return f"{ESRI_BASE_URL}{name}/FeatureServer" @@ -25,30 +25,6 @@ def get_esri_url(name: str)-> str: ROUTE_METHODOLOGY = "This data was estimated by combining GTFS real-time vehicle positions with GTFS scheduled trips and shapes. GTFS real-time (RT) vehicle positions are spatially joined to GTFS scheduled shapes, so only vehicle positions traveling along the route alignment path are kept. A sample of five vehicle positions are selected (min, 25th percentile, 50th percentile, 75th percentile, max). The trip speed is calculated using these five vehicle positions. Each trip is categorized into a time-of-day. The average speed for a route-direction-time_of_day is calculated. Additional metrics are stored, such as the number of trips observed, the average scheduled service minutes, and the average RT observed service minutes. For convenience, we also provide a singular shape (common_shape_id) to associate with a route-direction. This is the shape that had the most number of trips for a given route-direction. Time-of-day is determined by the GTFS scheduled trip start time. The trip start hour (military time) is categorized based on the following: Owl (0-3), Early AM (4-6), AM Peak (7-9), Midday (10-14), PM Peak (15-19), and Evening (20-23). 
The start and end hours are inclusive (e.g., 4-6 refers to 4am, 5am, and 6am)." -#--------------------------------------------------------# -# Rename columns -#--------------------------------------------------------# -RENAME_HQTA = { - "agency_pri": "agency_primary", - "agency_sec": "agency_secondary", - "hqta_detai": "hqta_details", - "base64_url": "base64_url_primary", - "base64_u_1": "base64_url_secondary", - "org_id_pri": "org_id_primary", - "org_id_sec": "org_id_secondary", -} - -RENAME_SPEED = { - "stop_seque": "stop_sequence", - "time_of_da": "time_of_day", - "time_perio": "time_period", - "district_n": "district_name", - "direction_": "direction_id", - "common_sha": "common_shape_id", - "avg_sched_": "avg_sched_trip_min", - "avg_rt_tri": "avg_rt_trip_min", - "caltrans_d": "district_name" -} #--------------------------------------------------------# # Put supplemental parts together into dict @@ -95,7 +71,7 @@ def get_esri_url(name: str)-> str: METADATA_FILE = "metadata.yml" - with open(f"./{METADATA_FILE}") as f: + with open(METADATA_FILE) as f: meta = yaml.load(f, yaml.Loader) # The dictionaries for each dataset are stored in a list @@ -122,7 +98,7 @@ def get_esri_url(name: str)-> str: # Output a json to use in ArcPro, and only of the subset of dict that's meta["tables"] JSON_FILE = utils.sanitize_file_path(METADATA_FILE) - with open(f"./{JSON_FILE}.json", 'w') as f: + with open(f"{JSON_FILE}.json", 'w') as f: json.dump(output, f) print(f"{JSON_FILE} produced") \ No newline at end of file diff --git a/open_data/update_data_dict.py b/open_data/update_data_dict.py index 4749a70b1..def688f64 100644 --- a/open_data/update_data_dict.py +++ b/open_data/update_data_dict.py @@ -7,11 +7,11 @@ from pathlib import Path from typing import Union +import publish_utils from update_vars import analysis_date catalog = intake.open_catalog("catalog.yml") - def unpack_list_of_tables_as_dict(list_of_dict: list) -> dict: """ In the yml, the datasets come as a list of 
dictionary items. @@ -54,8 +54,18 @@ def new_columns_for_data_dict( # Columns in our dataset FILE = catalog[t].urlpath - col_list = gpd.read_parquet(FILE).columns.tolist() - + gdf = gpd.read_parquet(FILE).pipe( + publish_utils.standardize_column_names + ).pipe( + publish_utils.remove_internal_keys) + + if "hq_" in t: + gdf = gdf.rename(columns = publish_utils.RENAME_HQTA) + elif "speed" in t: + gdf = gdf.rename(columns = publish_utils.RENAME_SPEED) + + col_list = gdf.columns.tolist() + # Columns included in data dictionary cols_defined = [c for c in dict_of_tables[t].keys()] diff --git a/open_data/update_vars.py b/open_data/update_vars.py index 42be4729f..3add72c13 100644 --- a/open_data/update_vars.py +++ b/open_data/update_vars.py @@ -13,4 +13,13 @@ XML_FOLDER = Path("xml") DEFAULT_XML_TEMPLATE = XML_FOLDER.joinpath(Path("default_pro.xml")) META_JSON = Path("metadata.json") -DATA_DICT_YML = Path("data_dictionary.yml") \ No newline at end of file +DATA_DICT_YML = Path("data_dictionary.yml") + +RUN_ME = [ + "ca_hq_transit_areas", + "ca_hq_transit_stops", + "ca_transit_routes", + "ca_transit_stops", + "speeds_by_stop_segments", + "speeds_by_route_time_of_day", +] \ No newline at end of file diff --git a/open_data/xml/ca_hq_transit_areas.xml b/open_data/xml/ca_hq_transit_areas.xml index 4ed905d61..1068376e0 100644 --- a/open_data/xml/ca_hq_transit_areas.xml +++ b/open_data/xml/ca_hq_transit_areas.xml @@ -20,7 +20,7 @@ - 2024-01-26 + 2024-02-23 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-01-17 + 2024-02-14 diff --git a/open_data/xml/ca_hq_transit_stops.xml b/open_data/xml/ca_hq_transit_stops.xml index 2cbbe7003..625c891da 100644 --- a/open_data/xml/ca_hq_transit_stops.xml +++ b/open_data/xml/ca_hq_transit_stops.xml @@ -20,7 +20,7 @@ - 2024-01-26 + 2024-02-23 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-01-17 + 2024-02-14 diff --git 
a/open_data/xml/ca_transit_routes.xml b/open_data/xml/ca_transit_routes.xml index a4b84ba70..32da41b3b 100644 --- a/open_data/xml/ca_transit_routes.xml +++ b/open_data/xml/ca_transit_routes.xml @@ -20,7 +20,7 @@ - 2024-01-26 + 2024-02-23 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-01-17 + 2024-02-14 diff --git a/open_data/xml/ca_transit_stops.xml b/open_data/xml/ca_transit_stops.xml index 5c7a70433..dbdb25fbc 100644 --- a/open_data/xml/ca_transit_stops.xml +++ b/open_data/xml/ca_transit_stops.xml @@ -20,7 +20,7 @@ - 2024-01-26 + 2024-02-23 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-01-17 + 2024-02-14 diff --git a/open_data/xml/speeds_by_route_time_of_day.xml b/open_data/xml/speeds_by_route_time_of_day.xml index ef3fac639..6b27e01b8 100644 --- a/open_data/xml/speeds_by_route_time_of_day.xml +++ b/open_data/xml/speeds_by_route_time_of_day.xml @@ -20,7 +20,7 @@ - 2024-01-26 + 2024-02-23 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-01-17 + 2024-02-14 diff --git a/open_data/xml/speeds_by_stop_segments.xml b/open_data/xml/speeds_by_stop_segments.xml index 03345a003..ae549bb58 100644 --- a/open_data/xml/speeds_by_stop_segments.xml +++ b/open_data/xml/speeds_by_stop_segments.xml @@ -20,7 +20,7 @@ - 2024-01-26 + 2024-02-23 ISO 19139 Geographic Information - Metadata - Implementation Specification @@ -85,7 +85,7 @@ - 2024-01-17 + 2024-02-14