Skip to content

Commit

Permalink
287 improve osm maps with tag metadata (#294)
Browse files Browse the repository at this point in the history
* feat: OSM filtering operations

* feat: Visualise OSM Way feature

* feat: Finalised version of OSM tutorial

* feat: Updated dependencies in GTFS tutorial

* feat: Way feature tag metadata added to folium maps

* feat: Add node context to folium map

* test: Folium maps produced when specify plot_ids with include_tags=True

* refactor: Move utility to class method

* refactor: Remove surplus dict copy statement

* docs: Update osm tutorial with example adding tag metadata to folium map

* test: Tooltip has been assembled as needed

* test: Internal method raises type error

* refactor: Need to import PerformanceWarning to test module

* test: runexpensive test for raise of perf warning on FindTag init with large osm fixture

* chore: Pin numpy to avoid numpy.dtype size change error with numpy=2.0.0

* docs: PR docs suggestion

* refactor: PR make large_file_thresh more obvious as an internal attribute

* refactor: PR inherit from Warning not UserWarning

* feat: PR type defences for additional params

* chore: Consistency in req file
  • Loading branch information
r-leyshon authored Jun 25, 2024
1 parent 3fa154e commit cef299b
Show file tree
Hide file tree
Showing 4 changed files with 251 additions and 4 deletions.
10 changes: 9 additions & 1 deletion docs/tutorials/osm/index.qmd
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ xmin, ymin, xmax, ymax format. Call the list `BBOX_LIST`.
BBOX_LIST = [<INSERT_VALUES_HERE>]
```

### Exercise
### Solution

```{python}
BBOX_LIST = [-3.002175, 51.587035, -2.994271, 51.59095]
Expand Down Expand Up @@ -353,6 +353,14 @@ To read more on `osmosis` filtering strategies, refer to the `completeWays` and
`completeRelations` flag descriptions in the
[Osmosis detailed usage documentation](https://wiki.openstreetmap.org/wiki/Osmosis/Detailed_Usage_0.48).


Note that additional metadata can be added to the map by setting `include_tags=True`. Adding this rich contextual data to the map can be useful but is also computationally expensive. This operation should be avoided for large osm files; a `PerformanceWarning` is raised for files larger than 50 KB.

```{python}
loc_finder.plot_ids(id_finder.id_dict["way_ids"], feature_type="way", include_tags=True)
```

## Conclusion

Congratulations, you have successfully completed this tutorial on OpenStreetMap
Expand Down
5 changes: 3 additions & 2 deletions requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ seaborn
haversine
pretty_html_table
kaleido
numpy>=1.25.0 # test suite will fail if user installed lower than this
numpy==1.26.4 # ERROR - ValueError: numpy.dtype size changed, may indicate binary incompatibility. Expected 96 from C header, got 88 from PyObject
dask[dataframe]
pyarrow >= 14.0.1 # 14.0.0 has security vulnerability
pyarrow>=14.0.1 # 14.0.0 has security vulnerability
osmium # has dependencies on `cmake` and `boost` which require brew install
tqdm
quartodoc
jupyterlab
xyzservices
-e .
180 changes: 179 additions & 1 deletion src/transport_performance/osm/validate_osm.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@
* Find coordinates for node or way features
* Plot the coordinates of a given list of node or way IDs
"""
import os
import warnings
from pathlib import Path
from typing import Union

Expand All @@ -26,6 +28,7 @@
import osmium
import pandas as pd
from shapely import Point
from xyzservices import TileProvider

from transport_performance.utils.defence import (
_check_item_in_iter,
Expand All @@ -37,6 +40,12 @@
# ---------utilities-----------


class PerformanceWarning(Warning):
    """Warn that an operation may be slow."""


def _compile_tags(osmium_feature):
"""Return tag name value pairs.
Expand Down Expand Up @@ -532,6 +541,9 @@ class FindTags:
Tags found for OSM relation features.
__area_tags : dict
Tags found for OSM area features.
__LARGE_FILE_THRESH : int
Size threshold in bytes on which to raise a PerformanceWarning when
osm file on disk exceeds.
"""

Expand All @@ -541,6 +553,16 @@ def __init__(
_is_expected_filetype(
osm_pth, "osm_pth", check_existing=True, exp_ext=".pbf"
)
self.__LARGE_FILE_THRESH = 50000 # 50 KB
# implement performance warning on large OSM files.
osm_size = os.path.getsize(osm_pth)
if osm_size > self.__LARGE_FILE_THRESH:
warnings.warn(
f"PBF file is {osm_size} bytes. Tag operations are expensive."
" Consider filtering the pbf file smaller than"
f" {self.__LARGE_FILE_THRESH} bytes",
PerformanceWarning,
)
tags = tag_collator()
classnm = tags.__class__.__name__
if classnm != "_TagHandler":
Expand Down Expand Up @@ -614,6 +636,9 @@ class FindLocations:
Locations of nodes.
__way_node_locs : dict
Locations of nodes that belong to a way.
_osm_pth : Union[Path, str]
Path to osm file on disk. Used for method plot_ids() when include_tags
is True.
"""

Expand All @@ -630,6 +655,7 @@ def __init__(
self.__node_locs = locs.node_locs
self.__way_node_locs = locs.way_node_locs
self.found_locs = dict()
self._osm_pth = osm_pth

def _check_is_implemented(self, user_feature: str, param_nm: str) -> None:
"""If the requested feature is not node or way, raise."""
Expand Down Expand Up @@ -678,11 +704,123 @@ def check_locs_for_ids(self, ids: list, feature_type: str) -> dict:
)
return self.found_locs

def _merge_dicts_retain_dupe_keys(
self, dict1: dict, dict2: dict, prepend_pattern: str = "parent_"
) -> dict:
"""Squish 2 dictionaries while retaining any duplicated keys.
Update dict1 with key:value pairs from dict2. If duplicated keys are
found in dict2, prepend the key with prepend_pattern.
Parameters
----------
dict1 : dict
Dictionary of (child or node) tags.
dict2 : dict
Dictionary of (parent) tags.
prepend_pattern : str
A string to prepend any duplicated keys in dict_2 with.
Returns
-------
dict
A merged dictionary, retaining key:value pairs from both.
"""
tags_out = {}
for d in [dict1, dict2]:
if not isinstance(d, dict):
raise TypeError(f"Expected dict but found {type(d)}: {d}")
for id_, tags in dict1.items(): # child_tags is nested
# find duplicated keys and prepend parent keys
if dupes := set(tags.keys()).intersection(dict2.keys()):
for key in dupes:
dict2[f"{prepend_pattern}{key}"] = dict2.pop(key)
# merge parent and child tag collections
tags_out[id_] = tags | dict2
return tags_out

def _add_tag_context_to_coord_gdf(  # noqa: C901
    self, ids: list, feature_type: str, tooltip_nm: str
) -> None:
    """Add a column of HTML tooltips to the coord_gdf attribute.

    Looks up OSM tag metadata for the requested IDs and writes a string
    column of folium-ready tooltips ("<b>key:</b> value<br>" fragments)
    to ``self.coord_gdf``. Handles node and way features separately.

    NOTE(review): requires ``self.tagfinder`` (a FindTags instance) and
    ``self.coord_gdf`` to have been populated by the caller — presumably
    ``plot_ids()``; confirm before calling from anywhere else.

    Parameters
    ----------
    ids : list
        A list of IDs.
    feature_type : str
        "way" or "node".
    tooltip_nm : str
        Name of the column to use for the tooltips.

    Returns
    -------
    None
        Updates `coord_gdf` attribute in place.

    """
    mapping = {}
    parent_tags = self.tagfinder.check_tags_for_ids(ids, feature_type)
    # seed the tooltip column from the index so the column exists before
    # the final map() call replaces it with tooltip strings
    self.coord_gdf[tooltip_nm] = self.coord_gdf.index.to_list()
    if feature_type == "way":
        # assumes coord_gdf carries a (parent_id, member_id) MultiIndex
        # for way features — TODO confirm against check_locs_for_ids
        parent_child_mapping = self.coord_gdf.index
        # Now we have child IDs, we need to run them through FindTags
        child_tags = self.tagfinder.check_tags_for_ids(
            [i[-1] for i in parent_child_mapping], feature_type="node"
        )
        # add in the parent tag ID to all child tags (mutates the
        # child_tags values in place)
        for k, v in child_tags.items():
            for t in parent_child_mapping.to_flat_index():
                if k == t[-1]:
                    v["parent_id"] = t[0]
        # merge the parent way metadata dictionary with the child
        # metadata dict
        all_tags = parent_child_mapping.to_series().to_dict()
        for k, v in parent_tags.items():
            # iterate over only the children for each parent node
            for id_ in [i for i in parent_child_mapping if i[0] == k]:
                all_tags[id_] = self._merge_dicts_retain_dupe_keys(
                    {id_[-1]: child_tags[id_[-1]]}, v
                )
        # add combined tags as custom tooltips to coord_gdf. Use map
        # method to avoid lexsort performance warning
        for _, v in all_tags.items():
            for k, val in v.items():
                tooltips = [
                    f"<b>{tag}:</b> {val_}<br>"
                    for tag, val_ in val.items()
                ]
                mapping[(val["parent_id"], k)] = "".join(tooltips)

    elif feature_type == "node":
        # nodes need no parent context; format their own tags directly
        for k, val in self.tagfinder.found_tags.items():
            tooltips = [
                f"<b>{tag}:</b> {val_}<br>" for tag, val_ in val.items()
            ]
            mapping[k] = "".join(tooltips)

    self.coord_gdf[tooltip_nm] = self.coord_gdf[tooltip_nm].map(mapping)
    return None

def plot_ids(
self,
ids: list,
feature_type: str,
crs: Union[str, int] = "epsg:4326",
include_tags: bool = False,
tooltip_nm: str = "custom_tooltip",
tooltip_kwds: dict = {"labels": False},
tiles: str = "CartoDB positron",
style_kwds: dict = {
"color": "#3f5277",
"fill": True,
"fillOpacity": 0.3,
"fillColor": "#3f5277",
"weight": 4,
},
) -> folium.Map:
"""Plot coordinates for nodes or node members of a way.
Expand All @@ -698,6 +836,26 @@ def plot_ids(
Whether the type of OSM feature to plot is node or way.
crs : Union[str, int], optional
The projection of the spatial features, by default "epsg:4326"
include_tags : bool
Should tag metadata be included in the map tooltips, by default
False
tooltip_nm : str
Name to use for tooltip column in coord_gdf attribute, by default
"custom_tooltip"
tooltip_kwds : dict
Additional tooltip styling arguments to pass to gpd explore(), by
default {"labels": False}
tiles : Union[str, xyzservices.TileProvider]
Basemap provider tiles to use, by default "CartoDB positron"
style_kwds : dict
Additional map styling arguments to pass to gpd explore(), by
default {
"color": "#3f5277",
"fill": True,
"fillOpacity": 0.3,
"fillColor": "#3f5277",
"weight": 4,
}
Returns
-------
Expand All @@ -719,6 +877,11 @@ def plot_ids(
_type_defence(ids, "ids", list)
_type_defence(feature_type, "feature_type", str)
_type_defence(crs, "crs", (str, int))
_type_defence(include_tags, "include_tags", bool)
_type_defence(tooltip_nm, "tooltip_nm", str)
_type_defence(tooltip_kwds, "tooltip_kwds", dict)
_type_defence(tiles, "tiles", (str, TileProvider))
_type_defence(style_kwds, "style_kwds", dict)
self._check_is_implemented(
user_feature=feature_type, param_nm="feature_type"
)
Expand All @@ -728,4 +891,19 @@ def plot_ids(
feature_type=feature_type,
crs=crs,
)
return self.coord_gdf.explore()
if not include_tags:
imap = self.coord_gdf.explore(tiles=tiles, style_kwds=style_kwds)
else:
# retrieve tags for IDs and add them to self.coord_gdf
self.tagfinder = FindTags(self._osm_pth)
self._add_tag_context_to_coord_gdf(
ids, feature_type, tooltip_nm=tooltip_nm
)
imap = self.coord_gdf.explore(
tooltip=tooltip_nm,
tooltip_kwds=tooltip_kwds,
tiles=tiles,
style_kwds=style_kwds,
)

return imap
60 changes: 60 additions & 0 deletions tests/osm/test_validate_osm.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
FindIds,
FindLocations,
FindTags,
PerformanceWarning,
_convert_osm_dict_to_gdf,
_filter_target_dict_with_list,
)
Expand Down Expand Up @@ -321,6 +322,19 @@ def test_find_locations_init(self, _tiny_osm_locs):
way_len == 2
), f"Expected way with length 2, instead found {way_len}"

def test__merge_dicts_retain_dupe_keys_raises(self, _tiny_osm_locs):
    """Test internal raises TypeError."""
    expected = re.escape(
        "Expected dict but found <class 'list'>: ['not a key', 2]"
    )
    with pytest.raises(TypeError, match=expected):
        _tiny_osm_locs._merge_dicts_retain_dupe_keys(
            {"some_key": 1}, ["not a key", 2]
        )

def test_check_locs_for_ids(self, _tiny_osm_locs, _tiny_osm_ids):
"""Assert check_locs_for_ids."""
ids = _tiny_osm_ids
Expand Down Expand Up @@ -368,8 +382,42 @@ def test_plot_ids_on_pass(self, _tiny_osm_locs, _tiny_osm_ids):
ids=ids._FindIds__node_ids[0:1], feature_type="node"
)
assert isinstance(plt, folium.Map)
plt = locs.plot_ids(
ids=ids._FindIds__node_ids[0:1],
feature_type="node",
include_tags=True,
)
assert isinstance(plt, folium.Map)
# check the tag column is as expected - for nodes, this example should
# be empty, nodes often contain no tags, but not always
pd.testing.assert_series_equal(
locs.coord_gdf["custom_tooltip"],
pd.Series([""], index=[7727955], name="custom_tooltip"),
)
assert locs.coord_gdf["custom_tooltip"].values == [""]
plt = locs.plot_ids(ids=ids._FindIds__way_ids[0:1], feature_type="way")
assert isinstance(plt, folium.Map)
plt = locs.plot_ids(
ids=ids._FindIds__way_ids[0:1],
feature_type="way",
include_tags=True,
)
# check the tag column is as expected - for ways, these should always
# include at least the parent_id tag.
pd.testing.assert_series_equal(
locs.coord_gdf["custom_tooltip"],
pd.Series(
[
"<b>crossing:</b> marked<br><b>highway:</b> crossing<br><b>tactile_paving:</b> yes<br><b>parent_id:</b> 4811009<br><b>lanes:</b> 2<br><b>name:</b> Kingsway<br><b>oneway:</b> yes<br><b>postal_code:</b> NP20<br><b>ref:</b> A4042<br><b>parent_highway:</b> primary<br>", # noqa E501
"<b>parent_id:</b> 4811009<br><b>lanes:</b> 2<br><b>name:</b> Kingsway<br><b>oneway:</b> yes<br><b>postal_code:</b> NP20<br><b>ref:</b> A4042<br><b>parent_highway:</b> primary<br>", # noqa E501
],
index=pd.MultiIndex.from_tuples(
[(4811009, 7447008812), (4811009, 443158788)],
names=["parent_id", "member_id"],
),
name="custom_tooltip",
),
)

def test_plot_ids_not_implemented(self, _tiny_osm_locs):
"""Assert asking for relation or area riases not implemented error."""
Expand Down Expand Up @@ -419,6 +467,18 @@ def test_find_tags_init(self, _tiny_osm_tags):
]
_class_atttribute_assertions(tags, expected_attrs, expected_methods)

@pytest.mark.runexpensive
def test_find_tags_init_warning(self):
    """Test that large OSM files trigger a performance warning.

    Execution duration c.80 seconds.
    """
    expected = ".*Consider filtering the pbf file smaller than 50000 bytes"
    with pytest.warns(PerformanceWarning, match=expected):
        FindTags(here("tests/data/newport-2023-06-13.osm.pbf"))

def test_find_tags_check_tags_for_ids(self, _tiny_osm_tags, _tiny_osm_ids):
"""Test FindTags.check_tags_for_ids()."""
ids = _tiny_osm_ids
Expand Down

0 comments on commit cef299b

Please sign in to comment.