Skip to content

Avoid pickling shapely object in Exposures.write_hdf5 #1051

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
9 commits merged on May 16, 2025
5 changes: 5 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,23 @@ Removed:
- `pandas-datareader`

### Added

- Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029)

### Changed

- `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
- World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
- `Exposures.write_hdf5` stores geometry data in WKB format instead of pickling shapely objects, which is faster and more sustainable. [#1051](https://github.com/CLIMADA-project/climada_python/pull/1051)

### Fixed

- NaN plotting issues in `geo_im_from_array` [#1038](https://github.com/CLIMADA-project/climada_python/pull/1038)

### Deprecated

### Removed

- `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)

## 6.0.1
Expand Down
18 changes: 13 additions & 5 deletions climada/entity/exposures/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@

import cartopy.crs as ccrs
import contextily as ctx
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
Expand Down Expand Up @@ -1131,10 +1132,8 @@ def write_hdf5(self, file_name):
"""
LOGGER.info("Writing %s", file_name)
store = pd.HDFStore(file_name, mode="w")
pandas_df = pd.DataFrame(self.gdf)
for col in pandas_df.columns:
if str(pandas_df[col].dtype) == "geometry":
pandas_df[col] = np.asarray(self.gdf[col])
geocols = self.data.columns[self.data.dtypes == "geometry"].to_list()
pandas_df = self.data.to_wkb()

# Avoid pandas PerformanceWarning when writing HDF5 data
with warnings.catch_warnings():
Expand All @@ -1146,6 +1145,7 @@ def write_hdf5(self, file_name):
for var in type(self)._metadata:
var_meta[var] = getattr(self, var)
var_meta["crs"] = self.crs
var_meta["wkb_columns"] = geocols
store.get_storer("exposures").attrs.metadata = var_meta

store.close()
Expand Down Expand Up @@ -1184,7 +1184,15 @@ def from_hdf5(cls, file_name):
crs = metadata.get("crs", metadata.get("_crs"))
if crs is None and metadata.get("meta"):
crs = metadata["meta"].get("crs")
exp = cls(store["exposures"], crs=crs)
data = pd.DataFrame(store["exposures"])

wkb_columns = (
metadata.pop("wkb_columns") if "wkb_columns" in metadata else []
)
for col in wkb_columns:
data[col] = gpd.GeoSeries.from_wkb(data[col])

exp = cls(data, crs=crs)
for key, val in metadata.items():
if key in type(exp)._metadata: # pylint: disable=protected-access
setattr(exp, key, val)
Expand Down
52 changes: 30 additions & 22 deletions climada/entity/exposures/test/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -378,11 +378,14 @@ def test_read_template_pass(self):

def test_io_hdf5_pass(self):
"""write and read hdf5"""
exp_df = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
exp_df.check()
exp = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")

# set metadata
exp_df.ref_year = 2020
exp_df.value_unit = "XSD"
exp.ref_year = 2020
exp.value_unit = "XSD"

# add another geometry column
exp.data["geocol2"] = exp.data.geometry.copy(deep=True)

file_name = DATA_DIR.joinpath("test_hdf5_exp.h5")

Expand All @@ -392,46 +395,51 @@ def test_io_hdf5_pass(self):

with warnings.catch_warnings():
warnings.simplefilter("error", category=pd.errors.PerformanceWarning)
exp_df.write_hdf5(file_name)
exp.write_hdf5(file_name=file_name)

exp_read = Exposures.from_hdf5(file_name)

self.assertEqual(exp_df.ref_year, exp_read.ref_year)
self.assertEqual(exp_df.value_unit, exp_read.value_unit)
self.assertEqual(exp_df.description, exp_read.description)
np.testing.assert_array_equal(exp_df.latitude, exp_read.latitude)
np.testing.assert_array_equal(exp_df.longitude, exp_read.longitude)
np.testing.assert_array_equal(exp_df.value, exp_read.value)
self.assertEqual(exp.ref_year, exp_read.ref_year)
self.assertEqual(exp.value_unit, exp_read.value_unit)
self.assertEqual(exp.description, exp_read.description)
np.testing.assert_array_equal(exp.latitude, exp_read.latitude)
np.testing.assert_array_equal(exp.longitude, exp_read.longitude)
np.testing.assert_array_equal(exp.value, exp_read.value)
np.testing.assert_array_equal(
exp_df.data["deductible"].values, exp_read.data["deductible"].values
exp.data["deductible"].values, exp_read.data["deductible"].values
)
np.testing.assert_array_equal(
exp_df.data["cover"].values, exp_read.data["cover"].values
exp.data["cover"].values, exp_read.data["cover"].values
)
np.testing.assert_array_equal(
exp_df.data["region_id"].values, exp_read.data["region_id"].values
exp.data["region_id"].values, exp_read.data["region_id"].values
)
np.testing.assert_array_equal(
exp_df.data["category_id"].values, exp_read.data["category_id"].values
exp.data["category_id"].values, exp_read.data["category_id"].values
)
np.testing.assert_array_equal(
exp_df.data["impf_TC"].values, exp_read.data["impf_TC"].values
exp.data["impf_TC"].values, exp_read.data["impf_TC"].values
)
np.testing.assert_array_equal(
exp_df.data["centr_TC"].values, exp_read.data["centr_TC"].values
exp.data["centr_TC"].values, exp_read.data["centr_TC"].values
)
np.testing.assert_array_equal(
exp_df.data["impf_FL"].values, exp_read.data["impf_FL"].values
exp.data["impf_FL"].values, exp_read.data["impf_FL"].values
)
np.testing.assert_array_equal(
exp_df.data["centr_FL"].values, exp_read.data["centr_FL"].values
exp.data["centr_FL"].values, exp_read.data["centr_FL"].values
)

self.assertTrue(
u_coord.equal_crs(exp_df.crs, exp_read.crs),
f"{exp_df.crs} and {exp_read.crs} are different",
u_coord.equal_crs(exp.crs, exp_read.crs),
f"{exp.crs} and {exp_read.crs} are different",
)
self.assertTrue(u_coord.equal_crs(exp.data.crs, exp_read.data.crs))

self.assertTrue(exp_read.data["geocol2"].dtype == "geometry")
np.testing.assert_array_equal(
exp.data["geocol2"].geometry, exp_read.data["geocol2"].values
)
self.assertTrue(u_coord.equal_crs(exp_df.gdf.crs, exp_read.gdf.crs))


class TestAddSea(unittest.TestCase):
Expand Down
Loading