From 297326eb1053c4ccc74bbf9e94e3cff04dba3348 Mon Sep 17 00:00:00 2001 From: emanuel-schmid Date: Wed, 16 Apr 2025 10:07:47 +0200 Subject: [PATCH 1/8] refactor Exposures.write_hdf5 and .from_hdf5: use wkb instead of pickle for geometry serialization --- climada/entity/exposures/base.py | 44 +++++++++++++++++++++++++++++--- 1 file changed, 40 insertions(+), 4 deletions(-) diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py index f437d2d46..22889d5aa 100644 --- a/climada/entity/exposures/base.py +++ b/climada/entity/exposures/base.py @@ -37,6 +37,7 @@ from geopandas import GeoDataFrame, GeoSeries, points_from_xy from mpl_toolkits.axes_grid1 import make_axes_locatable from rasterio.warp import Resampling +from xarray import DataArray import climada.util.coordinates as u_coord import climada.util.hdf5_handler as u_hdf5 @@ -1121,20 +1122,30 @@ def plot_basemap( self.to_crs(crs_ori, inplace=True) return axis - def write_hdf5(self, file_name): + def write_hdf5(self, file_name, pickle_geometry=False): """Write data frame and metadata in hdf5 format Parameters ---------- file_name : str (path and) file name to write to. + pickle_geometry : bool + flag, indicating whether the "geometry" of the Exposures` `data` will be stored as + pickled shapely objects instead of wkb bytes. This is faster but less durable, because + pickled data may get unreadable for future shapely versions. + Default: False """ LOGGER.info("Writing %s", file_name) store = pd.HDFStore(file_name, mode="w") - pandas_df = pd.DataFrame(self.gdf) + pandas_df = pd.DataFrame(self.data) + wkb_data = {} for col in pandas_df.columns: if str(pandas_df[col].dtype) == "geometry": - pandas_df[col] = np.asarray(self.gdf[col]) + if pickle_geometry: + pandas_df[col] = np.asarray(self.data[col]) + else: + wkb_data[col] = to_wkb_store(self.geometry) + pandas_df.drop(columns=["geometry"]) # Avoid pandas PerformanceWarning when writing HDF5 data with warnings.catch_warnings(): @@ -1142,6 +1153,9 @@ def write_hdf5(self, file_name): # Write dataframe store.put("exposures", pandas_df) + if wkb_data: + store.put("wkb_data", wkb_data) + var_meta = {} for var in type(self)._metadata: var_meta[var] = getattr(self, var) @@ -1184,7 +1198,14 @@ def from_hdf5(cls, file_name): crs = metadata.get("crs", metadata.get("_crs")) if crs is None and metadata.get("meta"): crs = metadata["meta"].get("crs") - exp = cls(store["exposures"], crs=crs) + data = pd.DataFrame(store["exposures"]) + try: + wkb_data = store.get("wkb_data") + except KeyError: + wkb_data = {} + for col, val in wkb_data.items(): + data[col] = from_wkb_store(val) + exp = cls(data, crs=crs) for key, val in metadata.items(): if key in type(exp)._metadata: # pylint: disable=protected-access setattr(exp, key, val) @@ -1553,6 +1574,21 @@ def _read_mat_optional(exposures, data, var_names): pass +def to_wkb_store(geometry: np.array, store): + wkb_data = geometry.to_wkb().to_numpy() + import h5py + + wkb_dataset = h5py.Dataset(store) + + # Store WKB as variable-length byte arrays + dt = h5py.vlen_dtype(np.dtype("uint8")) + wkb_dataset.dtype = dt + for i, geom_bytes in enumerate(wkb_data): + wkb_dataset[i] = np.frombuffer(geom_bytes, dtype="uint8") + + return wkb_data + + def _read_mat_metadata(exposures, data, file_name, var_names): """Fill metadata in DataFrame object""" try: From 5e8e583c4ea69b9c121dfe8afe3cf5dc10d7f736 Mon Sep 17 00:00:00 2001 From: emanuel-schmid Date: Tue, 22 Apr 2025 16:53:25 +0200 Subject: [PATCH 2/8] refactor Exposures.write_hdf5 --- climada/entity/exposures/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py index 22889d5aa..69512c49d 100644 --- a/climada/entity/exposures/base.py +++ b/climada/entity/exposures/base.py @@ -1145,7 +1145,7 @@ def write_hdf5(self, file_name, pickle_geometry=False): pandas_df[col] = np.asarray(self.data[col]) else: wkb_data[col] = to_wkb_store(self.geometry) - pandas_df.drop(columns=["geometry"]) + pandas_df.drop(columns=[col], inplace=True) # Avoid pandas PerformanceWarning when writing HDF5 data with warnings.catch_warnings(): From ab18675045105ace837cd850e69ef4a72bcc0a51 Mon Sep 17 00:00:00 2001 From: emanuel-schmid Date: Wed, 30 Apr 2025 16:05:35 +0200 Subject: [PATCH 3/8] change of plan: just pickle geometries in wkb format --- CHANGELOG.md | 5 ++ climada/entity/exposures/base.py | 55 +++++-------- climada/entity/exposures/test/test_base.py | 89 +++++++++++----------- 3 files changed, 71 insertions(+), 78 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 42757783c..74780084e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -15,18 +15,23 @@ Removed: - `pandas-datareader` ### Added + - Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029) ### Changed + - `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012) - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033) +- `Exposures.write_hdf5` pickles geometry data in WKB format by default, and not as `shapely` objects anymore. There is now a flag to keep the previous behavior. ### Fixed + - NaN plotting issues in `geo_im_from_array`[#1038](https://github.com/CLIMADA-project/climada_python/pull/1038) ### Deprecated ### Removed + - `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012) ## 6.0.1 diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py index 69512c49d..fdcde8e6e 100644 --- a/climada/entity/exposures/base.py +++ b/climada/entity/exposures/base.py @@ -29,6 +29,7 @@ import cartopy.crs as ccrs import contextily as ctx +import geopandas as gpd import matplotlib.pyplot as plt import numpy as np import pandas as pd @@ -37,7 +38,6 @@ from geopandas import GeoDataFrame, GeoSeries, points_from_xy from mpl_toolkits.axes_grid1 import make_axes_locatable from rasterio.warp import Resampling -from xarray import DataArray import climada.util.coordinates as u_coord import climada.util.hdf5_handler as u_hdf5 @@ -1122,30 +1122,31 @@ def plot_basemap( self.to_crs(crs_ori, inplace=True) return axis - def write_hdf5(self, file_name, pickle_geometry=False): + def write_hdf5(self, file_name, pickle_geometry_as_shapely=False): """Write data frame and metadata in hdf5 format Parameters ---------- file_name : str (path and) file name to write to. - pickle_geometry : bool + pickle_geometry_as_shapely : bool flag, indicating whether the "geometry" of the Exposures` `data` will be stored as - pickled shapely objects instead of wkb bytes. This is faster but less durable, because - pickled data may get unreadable for future shapely versions. + pickled shapely objects instead of wkb bytes. This has been the case for earlier + CLIMADA version, up to 6.0, and is perhaps faster but less durable, + because pickled data may evantually get unreadable for future shapely versions. Default: False """ LOGGER.info("Writing %s", file_name) store = pd.HDFStore(file_name, mode="w") - pandas_df = pd.DataFrame(self.data) - wkb_data = {} + pandas_df = pd.DataFrame(self.gdf) + wkb_columns = [] for col in pandas_df.columns: if str(pandas_df[col].dtype) == "geometry": - if pickle_geometry: - pandas_df[col] = np.asarray(self.data[col]) + if pickle_geometry_as_shapely: + pandas_df[col] = np.asarray(self.gdf[col]) else: - wkb_data[col] = to_wkb_store(self.geometry) - pandas_df.drop(columns=[col], inplace=True) + pandas_df[col] = gpd.GeoSeries.to_wkb(pandas_df[col]) + wkb_columns.append(col) # Avoid pandas PerformanceWarning when writing HDF5 data with warnings.catch_warnings(): @@ -1153,13 +1154,11 @@ def write_hdf5(self, file_name, pickle_geometry=False): # Write dataframe store.put("exposures", pandas_df) - if wkb_data: - store.put("wkb_data", wkb_data) - var_meta = {} for var in type(self)._metadata: var_meta[var] = getattr(self, var) var_meta["crs"] = self.crs + var_meta["wkb_columns"] = wkb_columns store.get_storer("exposures").attrs.metadata = var_meta store.close() @@ -1199,12 +1198,13 @@ def from_hdf5(cls, file_name): if crs is None and metadata.get("meta"): crs = metadata["meta"].get("crs") data = pd.DataFrame(store["exposures"]) - try: - wkb_data = store.get("wkb_data") - except KeyError: - wkb_data = {} - for col, val in wkb_data.items(): - data[col] = from_wkb_store(val) + + wkb_columns = ( + metadata.pop("wkb_columns") if "wkb_columns" in metadata else [] + ) + for col in wkb_columns: + data[col] = gpd.GeoSeries.from_wkb(data[col]) + exp = cls(data, crs=crs) for key, val in metadata.items(): if key in type(exp)._metadata: # pylint: disable=protected-access @@ -1574,21 +1574,6 @@ def _read_mat_optional(exposures, data, var_names): pass -def to_wkb_store(geometry: np.array, store): - wkb_data = geometry.to_wkb().to_numpy() - import h5py - - wkb_dataset = h5py.Dataset(store) - - # Store WKB as variable-length byte arrays - dt = h5py.vlen_dtype(np.dtype("uint8")) - wkb_dataset.dtype = dt - for i, geom_bytes in enumerate(wkb_data): - wkb_dataset[i] = np.frombuffer(geom_bytes, dtype="uint8") - - return wkb_data - - def _read_mat_metadata(exposures, data, file_name, var_names): """Fill metadata in DataFrame object""" try: diff --git a/climada/entity/exposures/test/test_base.py b/climada/entity/exposures/test/test_base.py index 66e921cd4..55752785f 100644 --- a/climada/entity/exposures/test/test_base.py +++ b/climada/entity/exposures/test/test_base.py @@ -378,11 +378,11 @@ def test_read_template_pass(self): def test_io_hdf5_pass(self): """write and read hdf5""" - exp_df = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632") - exp_df.check() + exp = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632") + # set metadata - exp_df.ref_year = 2020 - exp_df.value_unit = "XSD" + exp.ref_year = 2020 + exp.value_unit = "XSD" file_name = DATA_DIR.joinpath("test_hdf5_exp.h5") @@ -390,48 +390,51 @@ def test_io_hdf5_pass(self): # PerformanceWarning would result in test failure here import warnings - with warnings.catch_warnings(): - warnings.simplefilter("error", category=pd.errors.PerformanceWarning) - exp_df.write_hdf5(file_name) + for pickle_geometry_as_shapely in [False, True]: + with warnings.catch_warnings(): + warnings.simplefilter("error", category=pd.errors.PerformanceWarning) + exp.write_hdf5( + file_name, pickle_geometry_as_shapely=pickle_geometry_as_shapely + ) - exp_read = Exposures.from_hdf5(file_name) + exp_read = Exposures.from_hdf5(file_name) - self.assertEqual(exp_df.ref_year, exp_read.ref_year) - self.assertEqual(exp_df.value_unit, exp_read.value_unit) - self.assertEqual(exp_df.description, exp_read.description) - np.testing.assert_array_equal(exp_df.latitude, exp_read.latitude) - np.testing.assert_array_equal(exp_df.longitude, exp_read.longitude) - np.testing.assert_array_equal(exp_df.value, exp_read.value) - np.testing.assert_array_equal( - exp_df.data["deductible"].values, exp_read.data["deductible"].values - ) - np.testing.assert_array_equal( - exp_df.data["cover"].values, exp_read.data["cover"].values - ) - np.testing.assert_array_equal( - exp_df.data["region_id"].values, exp_read.data["region_id"].values - ) - np.testing.assert_array_equal( - exp_df.data["category_id"].values, exp_read.data["category_id"].values - ) - np.testing.assert_array_equal( - exp_df.data["impf_TC"].values, exp_read.data["impf_TC"].values - ) - np.testing.assert_array_equal( - exp_df.data["centr_TC"].values, exp_read.data["centr_TC"].values - ) - np.testing.assert_array_equal( - exp_df.data["impf_FL"].values, exp_read.data["impf_FL"].values - ) - np.testing.assert_array_equal( - exp_df.data["centr_FL"].values, exp_read.data["centr_FL"].values - ) + self.assertEqual(exp.ref_year, exp_read.ref_year) + self.assertEqual(exp.value_unit, exp_read.value_unit) + self.assertEqual(exp.description, exp_read.description) + np.testing.assert_array_equal(exp.latitude, exp_read.latitude) + np.testing.assert_array_equal(exp.longitude, exp_read.longitude) + np.testing.assert_array_equal(exp.value, exp_read.value) + np.testing.assert_array_equal( + exp.data["deductible"].values, exp_read.data["deductible"].values + ) + np.testing.assert_array_equal( + exp.data["cover"].values, exp_read.data["cover"].values + ) + np.testing.assert_array_equal( + exp.data["region_id"].values, exp_read.data["region_id"].values + ) + np.testing.assert_array_equal( + exp.data["category_id"].values, exp_read.data["category_id"].values + ) + np.testing.assert_array_equal( + exp.data["impf_TC"].values, exp_read.data["impf_TC"].values + ) + np.testing.assert_array_equal( + exp.data["centr_TC"].values, exp_read.data["centr_TC"].values + ) + np.testing.assert_array_equal( + exp.data["impf_FL"].values, exp_read.data["impf_FL"].values + ) + np.testing.assert_array_equal( + exp.data["centr_FL"].values, exp_read.data["centr_FL"].values + ) - self.assertTrue( - u_coord.equal_crs(exp_df.crs, exp_read.crs), - f"{exp_df.crs} and {exp_read.crs} are different", - ) - self.assertTrue(u_coord.equal_crs(exp_df.gdf.crs, exp_read.gdf.crs)) + self.assertTrue( + u_coord.equal_crs(exp.crs, exp_read.crs), + f"{exp.crs} and {exp_read.crs} are different", + ) + self.assertTrue(u_coord.equal_crs(exp.gdf.crs, exp_read.gdf.crs)) class TestAddSea(unittest.TestCase): From 347f1f9a77f818dc6cf7db9ee1736e90f2b1928f Mon Sep 17 00:00:00 2001 From: Emanuel Schmid <51439563+emanuel-schmid@users.noreply.github.com> Date: Tue, 6 May 2025 17:50:11 +0200 Subject: [PATCH 4/8] Update climada/entity/exposures/base.py Co-authored-by: Lukas Riedel <34276446+peanutfun@users.noreply.github.com> --- climada/entity/exposures/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py index fdcde8e6e..5c687d80d 100644 --- a/climada/entity/exposures/base.py +++ b/climada/entity/exposures/base.py @@ -1145,7 +1145,7 @@ def write_hdf5(self, file_name, pickle_geometry_as_shapely=False): if pickle_geometry_as_shapely: pandas_df[col] = np.asarray(self.gdf[col]) else: - pandas_df[col] = gpd.GeoSeries.to_wkb(pandas_df[col]) + pandas_df[col] = gpd.GeoSeries(pandas_df[col]).to_wkb() wkb_columns.append(col) # Avoid pandas PerformanceWarning when writing HDF5 data From 9312b3044d45ff28aa091cf05985dbf348eefcd4 Mon Sep 17 00:00:00 2001 From: emanuel-schmid Date: Thu, 8 May 2025 09:46:30 +0200 Subject: [PATCH 5/8] abandon shapely pickling --- CHANGELOG.md | 2 +- climada/entity/exposures/base.py | 15 +--- climada/entity/exposures/test/test_base.py | 85 +++++++++++----------- 3 files changed, 45 insertions(+), 57 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 74780084e..373ee9cfe 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,7 +22,7 @@ Removed: - `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012) - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033) -- `Exposures.write_hdf5` pickles geometry data in WKB format by default, and not as `shapely` objects anymore. There is now a flag to keep the previous behavior. +- `Exposures.write_hdf5` pickles geometry data in WKB format, which is faster and more sustainable. [#1051](https://github.com/CLIMADA-project/climada_python/pull/1051) ### Fixed diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py index 5c687d80d..71545a6ef 100644 --- a/climada/entity/exposures/base.py +++ b/climada/entity/exposures/base.py @@ -1122,19 +1122,13 @@ def plot_basemap( self.to_crs(crs_ori, inplace=True) return axis - def write_hdf5(self, file_name, pickle_geometry_as_shapely=False): + def write_hdf5(self, file_name): """Write data frame and metadata in hdf5 format Parameters ---------- file_name : str (path and) file name to write to. - pickle_geometry_as_shapely : bool - flag, indicating whether the "geometry" of the Exposures` `data` will be stored as - pickled shapely objects instead of wkb bytes. This has been the case for earlier - CLIMADA version, up to 6.0, and is perhaps faster but less durable, - because pickled data may evantually get unreadable for future shapely versions. - Default: False """ LOGGER.info("Writing %s", file_name) store = pd.HDFStore(file_name, mode="w") @@ -1142,11 +1136,8 @@ def write_hdf5(self, file_name, pickle_geometry_as_shapely=False): wkb_columns = [] for col in pandas_df.columns: if str(pandas_df[col].dtype) == "geometry": - if pickle_geometry_as_shapely: - pandas_df[col] = np.asarray(self.gdf[col]) - else: - pandas_df[col] = gpd.GeoSeries(pandas_df[col]).to_wkb() - wkb_columns.append(col) + pandas_df[col] = gpd.GeoSeries(pandas_df[col]).to_wkb() + wkb_columns.append(col) # Avoid pandas PerformanceWarning when writing HDF5 data with warnings.catch_warnings(): diff --git a/climada/entity/exposures/test/test_base.py b/climada/entity/exposures/test/test_base.py index 55752785f..3a4ee3663 100644 --- a/climada/entity/exposures/test/test_base.py +++ b/climada/entity/exposures/test/test_base.py @@ -390,51 +390,48 @@ def test_io_hdf5_pass(self): # PerformanceWarning would result in test failure here import warnings - for pickle_geometry_as_shapely in [False, True]: - with warnings.catch_warnings(): - warnings.simplefilter("error", category=pd.errors.PerformanceWarning) - exp.write_hdf5( - file_name, pickle_geometry_as_shapely=pickle_geometry_as_shapely - ) - - exp_read = Exposures.from_hdf5(file_name) - - self.assertEqual(exp.ref_year, exp_read.ref_year) - self.assertEqual(exp.value_unit, exp_read.value_unit) - self.assertEqual(exp.description, exp_read.description) - np.testing.assert_array_equal(exp.latitude, exp_read.latitude) - np.testing.assert_array_equal(exp.longitude, exp_read.longitude) - np.testing.assert_array_equal(exp.value, exp_read.value) - np.testing.assert_array_equal( - exp.data["deductible"].values, exp_read.data["deductible"].values - ) - np.testing.assert_array_equal( - exp.data["cover"].values, exp_read.data["cover"].values - ) - np.testing.assert_array_equal( - exp.data["region_id"].values, exp_read.data["region_id"].values - ) - np.testing.assert_array_equal( - exp.data["category_id"].values, exp_read.data["category_id"].values - ) - np.testing.assert_array_equal( - exp.data["impf_TC"].values, exp_read.data["impf_TC"].values - ) - np.testing.assert_array_equal( - exp.data["centr_TC"].values, exp_read.data["centr_TC"].values - ) - np.testing.assert_array_equal( - exp.data["impf_FL"].values, exp_read.data["impf_FL"].values - ) - np.testing.assert_array_equal( - exp.data["centr_FL"].values, exp_read.data["centr_FL"].values - ) + with warnings.catch_warnings(): + warnings.simplefilter("error", category=pd.errors.PerformanceWarning) + exp.write_hdf5(file_name=file_name) + + exp_read = Exposures.from_hdf5(file_name) + + self.assertEqual(exp.ref_year, exp_read.ref_year) + self.assertEqual(exp.value_unit, exp_read.value_unit) + self.assertEqual(exp.description, exp_read.description) + np.testing.assert_array_equal(exp.latitude, exp_read.latitude) + np.testing.assert_array_equal(exp.longitude, exp_read.longitude) + np.testing.assert_array_equal(exp.value, exp_read.value) + np.testing.assert_array_equal( + exp.data["deductible"].values, exp_read.data["deductible"].values + ) + np.testing.assert_array_equal( + exp.data["cover"].values, exp_read.data["cover"].values + ) + np.testing.assert_array_equal( + exp.data["region_id"].values, exp_read.data["region_id"].values + ) + np.testing.assert_array_equal( + exp.data["category_id"].values, exp_read.data["category_id"].values + ) + np.testing.assert_array_equal( + exp.data["impf_TC"].values, exp_read.data["impf_TC"].values + ) + np.testing.assert_array_equal( + exp.data["centr_TC"].values, exp_read.data["centr_TC"].values + ) + np.testing.assert_array_equal( + exp.data["impf_FL"].values, exp_read.data["impf_FL"].values + ) + np.testing.assert_array_equal( + exp.data["centr_FL"].values, exp_read.data["centr_FL"].values + ) - self.assertTrue( - u_coord.equal_crs(exp.crs, exp_read.crs), - f"{exp.crs} and {exp_read.crs} are different", - ) - self.assertTrue(u_coord.equal_crs(exp.gdf.crs, exp_read.gdf.crs)) + self.assertTrue( + u_coord.equal_crs(exp.crs, exp_read.crs), + f"{exp.crs} and {exp_read.crs} are different", + ) + self.assertTrue(u_coord.equal_crs(exp.gdf.crs, exp_read.gdf.crs)) class TestAddSea(unittest.TestCase): From 8973490ec399641f58609464d7c37a72f1979801 Mon Sep 17 00:00:00 2001 From: emanuel-schmid Date: Mon, 12 May 2025 08:38:12 +0200 Subject: [PATCH 6/8] simplify wkb columns collection --- climada/entity/exposures/base.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py index 71545a6ef..634c660fc 100644 --- a/climada/entity/exposures/base.py +++ b/climada/entity/exposures/base.py @@ -1133,11 +1133,9 @@ def write_hdf5(self, file_name): LOGGER.info("Writing %s", file_name) store = pd.HDFStore(file_name, mode="w") pandas_df = pd.DataFrame(self.gdf) - wkb_columns = [] - for col in pandas_df.columns: - if str(pandas_df[col].dtype) == "geometry": - pandas_df[col] = gpd.GeoSeries(pandas_df[col]).to_wkb() - wkb_columns.append(col) + geocols = self.gdf.columns[self.gdf.dtypes == "geometry"].to_list() + for col in geocols: + pandas_df[col] = gpd.GeoSeries(pandas_df[col]).to_wkb() # Avoid pandas PerformanceWarning when writing HDF5 data with warnings.catch_warnings(): @@ -1149,7 +1147,7 @@ def write_hdf5(self, file_name): for var in type(self)._metadata: var_meta[var] = getattr(self, var) var_meta["crs"] = self.crs - var_meta["wkb_columns"] = wkb_columns + var_meta["wkb_columns"] = geocols store.get_storer("exposures").attrs.metadata = var_meta store.close() From 5ff1e19fc0eaa45d5c2df26e788cc0ce64c8ec19 Mon Sep 17 00:00:00 2001 From: emanuel-schmid Date: Mon, 12 May 2025 08:58:07 +0200 Subject: [PATCH 7/8] simplify wkb conversion --- climada/entity/exposures/base.py | 4 +--- climada/entity/exposures/test/test_base.py | 10 +++++++++- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py index 634c660fc..98d6c21cf 100644 --- a/climada/entity/exposures/base.py +++ b/climada/entity/exposures/base.py @@ -1132,10 +1132,8 @@ def write_hdf5(self, file_name): """ LOGGER.info("Writing %s", file_name) store = pd.HDFStore(file_name, mode="w") - pandas_df = pd.DataFrame(self.gdf) geocols = self.gdf.columns[self.gdf.dtypes == "geometry"].to_list() - for col in geocols: - pandas_df[col] = gpd.GeoSeries(pandas_df[col]).to_wkb() + pandas_df = self.data.to_wkb() # Avoid pandas PerformanceWarning when writing HDF5 data with warnings.catch_warnings(): diff --git a/climada/entity/exposures/test/test_base.py b/climada/entity/exposures/test/test_base.py index 3a4ee3663..77e1e50ec 100644 --- a/climada/entity/exposures/test/test_base.py +++ b/climada/entity/exposures/test/test_base.py @@ -384,6 +384,9 @@ def test_io_hdf5_pass(self): exp.ref_year = 2020 exp.value_unit = "XSD" + # add another geometry column + exp.data["geocol2"] = exp.data.geometry.copy(deep=True) + file_name = DATA_DIR.joinpath("test_hdf5_exp.h5") # pd.errors.PerformanceWarning should be suppressed. Therefore, make sure that @@ -431,7 +434,12 @@ def test_io_hdf5_pass(self): u_coord.equal_crs(exp.crs, exp_read.crs), f"{exp.crs} and {exp_read.crs} are different", ) - self.assertTrue(u_coord.equal_crs(exp.gdf.crs, exp_read.gdf.crs)) + self.assertTrue(u_coord.equal_crs(exp.data.crs, exp_read.data.crs)) + + self.assertTrue(exp_read.data["geocol2"].dtype == "geometry") + np.testing.assert_array_equal( + exp.data["geocol2"].geometry, exp_read.data["geocol2"].values + ) class TestAddSea(unittest.TestCase): From dea3e9993abd40b8a4a8fa1ba17d03ba8602d746 Mon Sep 17 00:00:00 2001 From: emanuel-schmid Date: Mon, 12 May 2025 09:02:33 +0200 Subject: [PATCH 8/8] cosmetics --- climada/entity/exposures/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py index 98d6c21cf..1021dc7ab 100644 --- a/climada/entity/exposures/base.py +++ b/climada/entity/exposures/base.py @@ -1132,7 +1132,7 @@ def write_hdf5(self, file_name): """ LOGGER.info("Writing %s", file_name) store = pd.HDFStore(file_name, mode="w") - geocols = self.gdf.columns[self.gdf.dtypes == "geometry"].to_list() + geocols = self.data.columns[self.data.dtypes == "geometry"].to_list() pandas_df = self.data.to_wkb() # Avoid pandas PerformanceWarning when writing HDF5 data