diff --git a/CHANGELOG.md b/CHANGELOG.md
index 42757783c..373ee9cfe 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -15,18 +15,23 @@ Removed:
 - `pandas-datareader`
 
 ### Added
+
 - Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029)
 
 ### Changed
+
 - `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
 - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
+- `Exposures.write_hdf5` stores geometry data in WKB format (instead of pickling the geometry objects), which is faster and more sustainable. [#1051](https://github.com/CLIMADA-project/climada_python/pull/1051)
 
 ### Fixed
+
 - NaN plotting issues in `geo_im_from_array` [#1038](https://github.com/CLIMADA-project/climada_python/pull/1038)
 
 ### Deprecated
 
 ### Removed
+
 - `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
 
 ## 6.0.1
diff --git a/climada/entity/exposures/base.py b/climada/entity/exposures/base.py
index f437d2d46..1021dc7ab 100644
--- a/climada/entity/exposures/base.py
+++ b/climada/entity/exposures/base.py
@@ -29,6 +29,7 @@
 
 import cartopy.crs as ccrs
 import contextily as ctx
+import geopandas as gpd
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -1131,10 +1132,8 @@ def write_hdf5(self, file_name):
         """
         LOGGER.info("Writing %s", file_name)
         store = pd.HDFStore(file_name, mode="w")
-        pandas_df = pd.DataFrame(self.gdf)
-        for col in pandas_df.columns:
-            if str(pandas_df[col].dtype) == "geometry":
-                pandas_df[col] = np.asarray(self.gdf[col])
+        geocols = self.data.columns[self.data.dtypes == "geometry"].to_list()
+        pandas_df = self.data.to_wkb()
 
         # Avoid pandas PerformanceWarning when writing HDF5 data
         with warnings.catch_warnings():
@@ -1146,6 +1145,7 @@ def write_hdf5(self, file_name):
         for var in type(self)._metadata:
             var_meta[var] = getattr(self, var)
         var_meta["crs"] = self.crs
+        var_meta["wkb_columns"] = geocols
         store.get_storer("exposures").attrs.metadata = var_meta
         store.close()
 
@@ -1184,7 +1184,15 @@ def from_hdf5(cls, file_name):
             crs = metadata.get("crs", metadata.get("_crs"))
             if crs is None and metadata.get("meta"):
                 crs = metadata["meta"].get("crs")
-            exp = cls(store["exposures"], crs=crs)
+            data = pd.DataFrame(store["exposures"])
+
+            wkb_columns = (
+                metadata.pop("wkb_columns") if "wkb_columns" in metadata else []
+            )
+            for col in wkb_columns:
+                data[col] = gpd.GeoSeries.from_wkb(data[col])
+
+            exp = cls(data, crs=crs)
             for key, val in metadata.items():
                 if key in type(exp)._metadata:  # pylint: disable=protected-access
                     setattr(exp, key, val)
diff --git a/climada/entity/exposures/test/test_base.py b/climada/entity/exposures/test/test_base.py
index 66e921cd4..77e1e50ec 100644
--- a/climada/entity/exposures/test/test_base.py
+++ b/climada/entity/exposures/test/test_base.py
@@ -378,11 +378,14 @@ def test_read_template_pass(self):
 
     def test_io_hdf5_pass(self):
         """write and read hdf5"""
-        exp_df = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
-        exp_df.check()
+        exp = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
+
         # set metadata
-        exp_df.ref_year = 2020
-        exp_df.value_unit = "XSD"
+        exp.ref_year = 2020
+        exp.value_unit = "XSD"
+
+        # add another geometry column
+        exp.data["geocol2"] = exp.data.geometry.copy(deep=True)
 
         file_name = DATA_DIR.joinpath("test_hdf5_exp.h5")
 
@@ -392,46 +395,51 @@ def test_io_hdf5_pass(self):
 
         with warnings.catch_warnings():
             warnings.simplefilter("error", category=pd.errors.PerformanceWarning)
-            exp_df.write_hdf5(file_name)
+            exp.write_hdf5(file_name=file_name)
 
         exp_read = Exposures.from_hdf5(file_name)
 
-        self.assertEqual(exp_df.ref_year, exp_read.ref_year)
-        self.assertEqual(exp_df.value_unit, exp_read.value_unit)
-        self.assertEqual(exp_df.description, exp_read.description)
-        np.testing.assert_array_equal(exp_df.latitude, exp_read.latitude)
-        np.testing.assert_array_equal(exp_df.longitude, exp_read.longitude)
-        np.testing.assert_array_equal(exp_df.value, exp_read.value)
+        self.assertEqual(exp.ref_year, exp_read.ref_year)
+        self.assertEqual(exp.value_unit, exp_read.value_unit)
+        self.assertEqual(exp.description, exp_read.description)
+        np.testing.assert_array_equal(exp.latitude, exp_read.latitude)
+        np.testing.assert_array_equal(exp.longitude, exp_read.longitude)
+        np.testing.assert_array_equal(exp.value, exp_read.value)
         np.testing.assert_array_equal(
-            exp_df.data["deductible"].values, exp_read.data["deductible"].values
+            exp.data["deductible"].values, exp_read.data["deductible"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["cover"].values, exp_read.data["cover"].values
+            exp.data["cover"].values, exp_read.data["cover"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["region_id"].values, exp_read.data["region_id"].values
+            exp.data["region_id"].values, exp_read.data["region_id"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["category_id"].values, exp_read.data["category_id"].values
+            exp.data["category_id"].values, exp_read.data["category_id"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["impf_TC"].values, exp_read.data["impf_TC"].values
+            exp.data["impf_TC"].values, exp_read.data["impf_TC"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["centr_TC"].values, exp_read.data["centr_TC"].values
+            exp.data["centr_TC"].values, exp_read.data["centr_TC"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["impf_FL"].values, exp_read.data["impf_FL"].values
+            exp.data["impf_FL"].values, exp_read.data["impf_FL"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["centr_FL"].values, exp_read.data["centr_FL"].values
+            exp.data["centr_FL"].values, exp_read.data["centr_FL"].values
         )
 
         self.assertTrue(
-            u_coord.equal_crs(exp_df.crs, exp_read.crs),
-            f"{exp_df.crs} and {exp_read.crs} are different",
+            u_coord.equal_crs(exp.crs, exp_read.crs),
+            f"{exp.crs} and {exp_read.crs} are different",
+        )
+        self.assertTrue(u_coord.equal_crs(exp.data.crs, exp_read.data.crs))
+
+        self.assertTrue(exp_read.data["geocol2"].dtype == "geometry")
+        np.testing.assert_array_equal(
+            exp.data["geocol2"].geometry, exp_read.data["geocol2"].values
         )
-        self.assertTrue(u_coord.equal_crs(exp_df.gdf.crs, exp_read.gdf.crs))
 
 
 class TestAddSea(unittest.TestCase):