
Commit 1a6df8e

Avoid pickling shapely objects in Exposures.write_hdf5 (#1051)

* refactor Exposures.write_hdf5 and .from_hdf5: use WKB instead of pickle for geometry serialization
* refactor Exposures.write_hdf5
* change of plan: just pickle geometries in WKB format
* Update climada/entity/exposures/base.py (Co-authored-by: Lukas Riedel <[email protected]>)
* abandon shapely pickling
* simplify WKB columns collection
* simplify WKB conversion
* cosmetics

Co-authored-by: Lukas Riedel <[email protected]>
1 parent 51b66fa commit 1a6df8e
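
The commit replaces pickled shapely objects with their WKB (well-known binary) encoding when the exposures GeoDataFrame is written to HDF5. The underlying geopandas round trip looks roughly like the following sketch, which is independent of CLIMADA (the data and column names are illustrative):

    import geopandas as gpd
    from shapely.geometry import Point

    # A tiny GeoDataFrame standing in for Exposures.data (illustrative values).
    gdf = gpd.GeoDataFrame(
        {"value": [1.0, 2.0]},
        geometry=[Point(8.55, 47.37), Point(7.45, 46.95)],
        crs="EPSG:4326",
    )

    # Encode every geometry column as WKB bytes: the result is a plain pandas
    # DataFrame that can be stored without pickling shapely objects.
    plain_df = gdf.to_wkb()
    print(type(plain_df["geometry"].iloc[0]))  # <class 'bytes'>

    # Decode the bytes back into geometries.
    restored = gpd.GeoSeries.from_wkb(plain_df["geometry"], crs="EPSG:4326")
    print(restored.iloc[0])  # POINT (8.55 47.37)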

File tree

3 files changed: +48 -27 lines


CHANGELOG.md (+5)

@@ -15,18 +15,23 @@ Removed:
 - `pandas-datareader`
 
 ### Added
+
 - Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029)
 
 ### Changed
+
 - `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
 - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
+- `Exposures.write_hdf5` serializes geometry data in WKB format instead of pickling shapely objects, which is faster and more sustainable. [#1051](https://github.com/CLIMADA-project/climada_python/pull/1051)
 
 ### Fixed
+
 - NaN plotting issues in `geo_im_from_array` [#1038](https://github.com/CLIMADA-project/climada_python/pull/1038)
 
 ### Deprecated
 
 ### Removed
+
 - `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
 
 ## 6.0.1
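
For users, the read/write API is unchanged; only the on-disk representation of geometry columns differs. A minimal usage sketch, assuming a working CLIMADA installation with HDF5 support (the toy data and file name below are made up):

    import geopandas as gpd
    from climada.entity import Exposures

    # Build a tiny Exposures object from an illustrative GeoDataFrame
    # (the HDF5 backend requires the `tables` package to be installed).
    gdf = gpd.GeoDataFrame(
        {"value": [100.0, 250.0]},
        geometry=gpd.points_from_xy([8.55, 7.45], [47.37, 46.95]),
        crs="EPSG:4326",
    )
    exp = Exposures(gdf)
    exp.check()

    # Write and read back; geometry columns travel as WKB bytes inside the HDF5 file.
    exp.write_hdf5("exposures_demo.h5")  # hypothetical file name
    exp_read = Exposures.from_hdf5("exposures_demo.h5")
    print(exp_read.gdf.geometry.head())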

climada/entity/exposures/base.py (+13 -5)

@@ -29,6 +29,7 @@
 
 import cartopy.crs as ccrs
 import contextily as ctx
+import geopandas as gpd
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -1131,10 +1132,8 @@ def write_hdf5(self, file_name):
         """
         LOGGER.info("Writing %s", file_name)
         store = pd.HDFStore(file_name, mode="w")
-        pandas_df = pd.DataFrame(self.gdf)
-        for col in pandas_df.columns:
-            if str(pandas_df[col].dtype) == "geometry":
-                pandas_df[col] = np.asarray(self.gdf[col])
+        geocols = self.data.columns[self.data.dtypes == "geometry"].to_list()
+        pandas_df = self.data.to_wkb()
 
         # Avoid pandas PerformanceWarning when writing HDF5 data
         with warnings.catch_warnings():
@@ -1146,6 +1145,7 @@ def write_hdf5(self, file_name):
         for var in type(self)._metadata:
             var_meta[var] = getattr(self, var)
         var_meta["crs"] = self.crs
+        var_meta["wkb_columns"] = geocols
         store.get_storer("exposures").attrs.metadata = var_meta
 
         store.close()
@@ -1184,7 +1184,15 @@ def from_hdf5(cls, file_name):
             crs = metadata.get("crs", metadata.get("_crs"))
             if crs is None and metadata.get("meta"):
                 crs = metadata["meta"].get("crs")
-            exp = cls(store["exposures"], crs=crs)
+            data = pd.DataFrame(store["exposures"])
+
+            wkb_columns = (
+                metadata.pop("wkb_columns") if "wkb_columns" in metadata else []
+            )
+            for col in wkb_columns:
+                data[col] = gpd.GeoSeries.from_wkb(data[col])
+
+            exp = cls(data, crs=crs)
             for key, val in metadata.items():
                 if key in type(exp)._metadata:  # pylint: disable=protected-access
                     setattr(exp, key, val)
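
The mechanics of the new write/read path can be reproduced with plain pandas and geopandas alone: encode geometry columns as WKB, record their names in the storer's metadata, and decode them again on read. A self-contained sketch under those assumptions (the store key, file name, and columns are illustrative; the `tables` package is required for HDF5 support):

    import warnings

    import geopandas as gpd
    import pandas as pd
    from shapely.geometry import Point

    gdf = gpd.GeoDataFrame(
        {"value": [1.0, 2.0]},
        geometry=[Point(0, 0), Point(1, 1)],
        crs="EPSG:4326",
    )

    # Write: geometry columns become WKB bytes, their names go into the metadata.
    geocols = gdf.columns[gdf.dtypes == "geometry"].to_list()
    with pd.HDFStore("demo_wkb.h5", mode="w") as store:  # hypothetical file name
        with warnings.catch_warnings():
            # bytes columns are stored as pickled objects, which pandas flags
            warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
            store.put("exposures", gdf.to_wkb())
        store.get_storer("exposures").attrs.metadata = {
            "crs": str(gdf.crs),
            "wkb_columns": geocols,
        }

    # Read: decode the recorded WKB columns back into geometries.
    with pd.HDFStore("demo_wkb.h5", mode="r") as store:
        metadata = store.get_storer("exposures").attrs.metadata
        data = pd.DataFrame(store["exposures"])
        for col in metadata.get("wkb_columns", []):
            data[col] = gpd.GeoSeries.from_wkb(data[col])
        restored = gpd.GeoDataFrame(data, geometry="geometry", crs=metadata["crs"])

    print(restored.dtypes)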

climada/entity/exposures/test/test_base.py (+30 -22)

@@ -378,11 +378,14 @@ def test_read_template_pass(self):
 
     def test_io_hdf5_pass(self):
         """write and read hdf5"""
-        exp_df = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
-        exp_df.check()
+        exp = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
+
         # set metadata
-        exp_df.ref_year = 2020
-        exp_df.value_unit = "XSD"
+        exp.ref_year = 2020
+        exp.value_unit = "XSD"
+
+        # add another geometry column
+        exp.data["geocol2"] = exp.data.geometry.copy(deep=True)
 
         file_name = DATA_DIR.joinpath("test_hdf5_exp.h5")
 
@@ -392,46 +395,51 @@ def test_io_hdf5_pass(self):
 
         with warnings.catch_warnings():
             warnings.simplefilter("error", category=pd.errors.PerformanceWarning)
-            exp_df.write_hdf5(file_name)
+            exp.write_hdf5(file_name=file_name)
 
         exp_read = Exposures.from_hdf5(file_name)
 
-        self.assertEqual(exp_df.ref_year, exp_read.ref_year)
-        self.assertEqual(exp_df.value_unit, exp_read.value_unit)
-        self.assertEqual(exp_df.description, exp_read.description)
-        np.testing.assert_array_equal(exp_df.latitude, exp_read.latitude)
-        np.testing.assert_array_equal(exp_df.longitude, exp_read.longitude)
-        np.testing.assert_array_equal(exp_df.value, exp_read.value)
+        self.assertEqual(exp.ref_year, exp_read.ref_year)
+        self.assertEqual(exp.value_unit, exp_read.value_unit)
+        self.assertEqual(exp.description, exp_read.description)
+        np.testing.assert_array_equal(exp.latitude, exp_read.latitude)
+        np.testing.assert_array_equal(exp.longitude, exp_read.longitude)
+        np.testing.assert_array_equal(exp.value, exp_read.value)
         np.testing.assert_array_equal(
-            exp_df.data["deductible"].values, exp_read.data["deductible"].values
+            exp.data["deductible"].values, exp_read.data["deductible"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["cover"].values, exp_read.data["cover"].values
+            exp.data["cover"].values, exp_read.data["cover"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["region_id"].values, exp_read.data["region_id"].values
+            exp.data["region_id"].values, exp_read.data["region_id"].values
        )
         np.testing.assert_array_equal(
-            exp_df.data["category_id"].values, exp_read.data["category_id"].values
+            exp.data["category_id"].values, exp_read.data["category_id"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["impf_TC"].values, exp_read.data["impf_TC"].values
+            exp.data["impf_TC"].values, exp_read.data["impf_TC"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["centr_TC"].values, exp_read.data["centr_TC"].values
+            exp.data["centr_TC"].values, exp_read.data["centr_TC"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["impf_FL"].values, exp_read.data["impf_FL"].values
+            exp.data["impf_FL"].values, exp_read.data["impf_FL"].values
        )
         np.testing.assert_array_equal(
-            exp_df.data["centr_FL"].values, exp_read.data["centr_FL"].values
+            exp.data["centr_FL"].values, exp_read.data["centr_FL"].values
         )
 
         self.assertTrue(
-            u_coord.equal_crs(exp_df.crs, exp_read.crs),
-            f"{exp_df.crs} and {exp_read.crs} are different",
+            u_coord.equal_crs(exp.crs, exp_read.crs),
+            f"{exp.crs} and {exp_read.crs} are different",
+        )
+        self.assertTrue(u_coord.equal_crs(exp.data.crs, exp_read.data.crs))
+
+        self.assertTrue(exp_read.data["geocol2"].dtype == "geometry")
+        np.testing.assert_array_equal(
+            exp.data["geocol2"].geometry, exp_read.data["geocol2"].values
         )
-        self.assertTrue(u_coord.equal_crs(exp_df.gdf.crs, exp_read.gdf.crs))
 
 
 class TestAddSea(unittest.TestCase):
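
The new assertions verify that a second geometry column ("geocol2") survives the round trip with geometry dtype and unchanged values. The same property can be checked for the bare WKB encode/decode step without the CLIMADA test fixtures; a small sketch with illustrative data:

    import geopandas as gpd
    from shapely.geometry import Point

    gdf = gpd.GeoDataFrame(geometry=[Point(0, 0), Point(1, 1)], crs="EPSG:4326")
    gdf["geocol2"] = gdf.geometry.copy(deep=True)  # second geometry column, as in the test

    # Round-trip the extra column through WKB and verify it is unchanged.
    encoded = gdf.to_wkb()
    decoded = gpd.GeoSeries.from_wkb(encoded["geocol2"], crs="EPSG:4326")

    assert decoded.dtype == "geometry"
    assert decoded.geom_equals(gpd.GeoSeries(gdf["geocol2"])).all()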
