diff --git a/docs/tutorials/osm/index.qmd b/docs/tutorials/osm/index.qmd index 4de5a910..769d9bab 100644 --- a/docs/tutorials/osm/index.qmd +++ b/docs/tutorials/osm/index.qmd @@ -128,7 +128,7 @@ xmin, ymin, xmax, ymax format. Call the list `BBOX_LIST`. BBOX_LIST = [] ``` -### Exercise +### Solution ```{python} BBOX_LIST = [-3.002175, 51.587035, -2.994271, 51.59095] @@ -353,6 +353,14 @@ To read more on `osmosis` filtering strategies, refer to the `completeWays` and `completeRelations` flag descriptions in the [Osmosis detailed usage documentation](https://wiki.openstreetmap.org/wiki/Osmosis/Detailed_Usage_0.48). + +Note that additional metadata can be added to the map by setting `include_tags=True`. Adding this rich contextual data to the map can be useful but is also computationally expensive. This operation should be avoided for large OSM files, for example anything over 500 KB; a `PerformanceWarning` is emitted once the file exceeds 50,000 bytes. + +```{python} +loc_finder.plot_ids(id_finder.id_dict["way_ids"], feature_type="way", include_tags=True) + +``` + ## Conclusion Congratulations, you have successfully completed this tutorial on OpenStreetMap diff --git a/requirements.txt b/requirements.txt index 166cf719..4ca7f409 100644 --- a/requirements.txt +++ b/requirements.txt @@ -28,11 +28,12 @@ seaborn haversine pretty_html_table kaleido -numpy>=1.25.0 # test suite will fail if user installed lower than this +numpy==1.26.4 # pinned to avoid `ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject` dask[dataframe] -pyarrow >= 14.0.1 # 14.0.0 has security vulnerability +pyarrow>=14.0.1 # 14.0.0 has security vulnerability osmium # has dependencies on `cmake` and `boost` which require brew install tqdm quartodoc jupyterlab +xyzservices -e . 
diff --git a/src/transport_performance/osm/validate_osm.py b/src/transport_performance/osm/validate_osm.py index 88628814..fbc439e7 100644 --- a/src/transport_performance/osm/validate_osm.py +++ b/src/transport_performance/osm/validate_osm.py @@ -18,6 +18,8 @@ * Find coordinates for node or way features * Plot the coordinates of a given list of node or way IDs """ +import os +import warnings from pathlib import Path from typing import Union @@ -26,6 +28,7 @@ import osmium import pandas as pd from shapely import Point +from xyzservices import TileProvider from transport_performance.utils.defence import ( _check_item_in_iter, @@ -37,6 +40,12 @@ # ---------utilities----------- +class PerformanceWarning(Warning): + """Operation may be slow.""" + + pass + + def _compile_tags(osmium_feature): """Return tag name value pairs. @@ -532,6 +541,9 @@ class FindTags: Tags found for OSM relation features. __area_tags : dict Tags found for OSM area features. + __LARGE_FILE_THRESH : int + Size threshold in bytes. A PerformanceWarning is raised when the osm + file on disk exceeds it. """ @@ -541,6 +553,16 @@ def __init__( _is_expected_filetype( osm_pth, "osm_pth", check_existing=True, exp_ext=".pbf" ) + self.__LARGE_FILE_THRESH = 50000 # 50 KB + # implement performance warning on large OSM files. + osm_size = os.path.getsize(osm_pth) + if osm_size > self.__LARGE_FILE_THRESH: + warnings.warn( + f"PBF file is {osm_size} bytes. Tag operations are expensive." + " Consider filtering the pbf file smaller than" + f" {self.__LARGE_FILE_THRESH} bytes", + PerformanceWarning, + ) tags = tag_collator() classnm = tags.__class__.__name__ if classnm != "_TagHandler": @@ -614,6 +636,9 @@ class FindLocations: Locations of nodes. __way_node_locs : dict Locations of nodes that belong to a way. + _osm_pth : Union[Path, str] + Path to osm file on disk. Used for method plot_ids() when include_tags + is True. 
""" @@ -630,6 +655,7 @@ def __init__( self.__node_locs = locs.node_locs self.__way_node_locs = locs.way_node_locs self.found_locs = dict() + self._osm_pth = osm_pth def _check_is_implemented(self, user_feature: str, param_nm: str) -> None: """If the requested feature is not node or way, raise.""" @@ -678,11 +704,123 @@ def check_locs_for_ids(self, ids: list, feature_type: str) -> dict: ) return self.found_locs + def _merge_dicts_retain_dupe_keys( + self, dict1: dict, dict2: dict, prepend_pattern: str = "parent_" + ) -> dict: + """Squish 2 dictionaries while retaining any duplicated keys. + + Merge the key:value pairs of dict2 into every nested collection in + dict1. Clashing dict2 keys are renamed with prepend_pattern, in place. + + Parameters + ---------- + dict1 : dict + Dictionary of (child or node) tags. + dict2 : dict + Dictionary of (parent) tags. + prepend_pattern : str + A string to prepend any duplicated keys in dict2 with. + + Returns + ------- + dict + A merged dictionary, retaining key:value pairs from both. + + """ + tags_out = {} + for d in [dict1, dict2]: + if not isinstance(d, dict): + raise TypeError(f"Expected dict but found {type(d)}: {d}") + for id_, tags in dict1.items(): # child_tags is nested + # find duplicated keys and prepend parent keys + if dupes := set(tags.keys()).intersection(dict2.keys()): + for key in dupes: + dict2[f"{prepend_pattern}{key}"] = dict2.pop(key) + # merge parent and child tag collections + tags_out[id_] = tags | dict2 + return tags_out + + def _add_tag_context_to_coord_gdf( # noqa: C901 + self, ids: list, feature_type: str, tooltip_nm: str + ) -> gpd.GeoDataFrame: + """Add a column of tooltips to the coord_gdf attribute. + + Handles node and way features separately. + + Parameters + ---------- + ids : list + A list of IDs. + feature_type : str + "way" or "node". + tooltip_nm : str + Name of the column to use for the tooltips. + + Returns + ------- + None + Updates `coord_gdf` attribute. 
+ + """ + mapping = {} + parent_tags = self.tagfinder.check_tags_for_ids(ids, feature_type) + self.coord_gdf[tooltip_nm] = self.coord_gdf.index.to_list() + if feature_type == "way": + parent_child_mapping = self.coord_gdf.index + # Now we have child IDs, we need to run them through FindTags + child_tags = self.tagfinder.check_tags_for_ids( + [i[-1] for i in parent_child_mapping], feature_type="node" + ) + # add in the parent tag ID to all child tags + for k, v in child_tags.items(): + for t in parent_child_mapping.to_flat_index(): + if k == t[-1]: + v["parent_id"] = t[0] + # merge the parent way metadata dictionary with the child + # metadata dict + all_tags = parent_child_mapping.to_series().to_dict() + for k, v in parent_tags.items(): + # iterate over only the children for each parent node + for id_ in [i for i in parent_child_mapping if i[0] == k]: + all_tags[id_] = self._merge_dicts_retain_dupe_keys( + {id_[-1]: child_tags[id_[-1]]}, v + ) + # add combined tags as custom tooltips to coord_gdf. Use map + # method to avoid lexsort performance warning + for _, v in all_tags.items(): + for k, val in v.items(): + tooltips = [ + f"{tag}: {val_}<br>" + for tag, val_ in val.items() + ] + mapping[(val["parent_id"], k)] = "".join(tooltips) + + elif feature_type == "node": + for k, val in self.tagfinder.found_tags.items(): + tooltips = [ + f"{tag}: {val_}<br>" for tag, val_ in val.items() + ] + mapping[k] = "".join(tooltips) + + self.coord_gdf[tooltip_nm] = self.coord_gdf[tooltip_nm].map(mapping) + return None + def plot_ids( self, ids: list, feature_type: str, crs: Union[str, int] = "epsg:4326", + include_tags: bool = False, + tooltip_nm: str = "custom_tooltip", + tooltip_kwds: dict = {"labels": False}, + tiles: str = "CartoDB positron", + style_kwds: dict = { + "color": "#3f5277", + "fill": True, + "fillOpacity": 0.3, + "fillColor": "#3f5277", + "weight": 4, + }, ) -> folium.Map: """Plot coordinates for nodes or node members of a way. 
@@ -698,6 +836,26 @@ def plot_ids( Whether the type of OSM feature to plot is node or way. crs : Union[str, int], optional The projection of the spatial features, by default "epsg:4326" + include_tags : bool + Should tag metadata be included in the map tooltips, by default + False + tooltip_nm : str + Name to use for tooltip column in coord_gdf attribute, by default + "custom_tooltip" + tooltip_kwds : dict + Additional tooltip styling arguments to pass to gpd explore(), by + default {"labels": False} + tiles : Union[str, xyzservices.TileProvider] + Basemap provider tiles to use, by default "CartoDB positron" + style_kwds : dict + Additional map styling arguments to pass to gpd explore(), by + default { + "color": "#3f5277", + "fill": True, + "fillOpacity": 0.3, + "fillColor": "#3f5277", + "weight": 4, + } Returns ------- @@ -719,6 +877,11 @@ def plot_ids( _type_defence(ids, "ids", list) _type_defence(feature_type, "feature_type", str) _type_defence(crs, "crs", (str, int)) + _type_defence(include_tags, "include_tags", bool) + _type_defence(tooltip_nm, "tooltip_nm", str) + _type_defence(tooltip_kwds, "tooltip_kwds", dict) + _type_defence(tiles, "tiles", (str, TileProvider)) + _type_defence(style_kwds, "style_kwds", dict) self._check_is_implemented( user_feature=feature_type, param_nm="feature_type" ) @@ -728,4 +891,19 @@ def plot_ids( feature_type=feature_type, crs=crs, ) - return 
self.coord_gdf.explore() + if not include_tags: + imap = self.coord_gdf.explore(tiles=tiles, style_kwds=style_kwds) + else: + # retrieve tags for IDs and add them to self.coord_gdf + self.tagfinder = FindTags(self._osm_pth) + self._add_tag_context_to_coord_gdf( + ids, feature_type, tooltip_nm=tooltip_nm + ) + imap = self.coord_gdf.explore( + tooltip=tooltip_nm, + tooltip_kwds=tooltip_kwds, + tiles=tiles, + style_kwds=style_kwds, + ) + + return imap diff --git a/tests/osm/test_validate_osm.py b/tests/osm/test_validate_osm.py index 9cf97bea..10a96fe3 100644 --- a/tests/osm/test_validate_osm.py +++ b/tests/osm/test_validate_osm.py @@ -11,6 +11,7 @@ FindIds, FindLocations, FindTags, + PerformanceWarning, _convert_osm_dict_to_gdf, _filter_target_dict_with_list, ) @@ -321,6 +322,19 @@ def test_find_locations_init(self, _tiny_osm_locs): way_len == 2 ), f"Expected way with length 2, instead found {way_len}" + def test__merge_dicts_retain_dupe_keys_raises(self, _tiny_osm_locs): + """Test internal raises TypeError.""" + with pytest.raises( + TypeError, + match=re.escape( + "Expected dict but found <class 'list'>: ['not a key', 2]" + ), + ): + _tiny_osm_locs._merge_dicts_retain_dupe_keys( + dict1={"some_key": 1}, + dict2=["not a key", 2], + ) + def test_check_locs_for_ids(self, _tiny_osm_locs, _tiny_osm_ids): """Assert check_locs_for_ids.""" ids = _tiny_osm_ids @@ -368,8 +382,42 @@ def test_plot_ids_on_pass(self, _tiny_osm_locs, _tiny_osm_ids): ids=ids._FindIds__node_ids[0:1], feature_type="node" ) assert isinstance(plt, folium.Map) + plt = locs.plot_ids( + ids=ids._FindIds__node_ids[0:1], + feature_type="node", + include_tags=True, + ) + assert isinstance(plt, folium.Map) + # check the tag column is as expected - for nodes, this example should + # be empty, nodes often contain no tags, but not always + pd.testing.assert_series_equal( + locs.coord_gdf["custom_tooltip"], + pd.Series([""], index=[7727955], name="custom_tooltip"), + ) + assert 
locs.coord_gdf["custom_tooltip"].values == [""] 
plt = locs.plot_ids(ids=ids._FindIds__way_ids[0:1], feature_type="way") assert isinstance(plt, folium.Map) + plt = locs.plot_ids( + ids=ids._FindIds__way_ids[0:1], + feature_type="way", + include_tags=True, + ) + # check the tag column is as expected - for ways, these should always + # include at least the parent_id tag. + pd.testing.assert_series_equal( + locs.coord_gdf["custom_tooltip"], + pd.Series( + [ + "crossing: marked<br>highway: crossing<br>tactile_paving: yes<br>parent_id: 4811009<br>lanes: 2<br>name: Kingsway<br>oneway: yes<br>postal_code: NP20<br>ref: A4042<br>parent_highway: primary<br>", # noqa E501 + "parent_id: 4811009<br>lanes: 2<br>name: Kingsway<br>oneway: yes<br>postal_code: NP20<br>ref: A4042<br>parent_highway: primary<br>", # noqa E501 + ], + index=pd.MultiIndex.from_tuples( + [(4811009, 7447008812), (4811009, 443158788)], + names=["parent_id", "member_id"], + ), + name="custom_tooltip", + ), + ) def test_plot_ids_not_implemented(self, _tiny_osm_locs): """Assert asking for relation or area riases not implemented error.""" @@ -419,6 +467,18 @@ def test_find_tags_init(self, _tiny_osm_tags): ] _class_atttribute_assertions(tags, expected_attrs, expected_methods) + @pytest.mark.runexpensive + def test_find_tags_init_warning(self): + """Test that large OSM files trigger a performance warning. + + execution duration c.80 seconds. + """ + with pytest.warns( + PerformanceWarning, + match=".*Consider filtering the pbf file smaller than 50000 bytes", + ): + FindTags(here("tests/data/newport-2023-06-13.osm.pbf")) + def test_find_tags_check_tags_for_ids(self, _tiny_osm_tags, _tiny_osm_ids): """Test FindTags.check_tags_for_ids().""" ids = _tiny_osm_ids