
Commit 1a6df8e

Avoid pickling shapely objects in Exposures.write_hdf5 (#1051)

* refactor Exposures.write_hdf5 and .from_hdf5: use WKB instead of pickle for geometry serialization
* refactor Exposures.write_hdf5
* change of plan: just pickle geometries in WKB format
* Update climada/entity/exposures/base.py (Co-authored-by: Lukas Riedel <[email protected]>)
* abandon shapely pickling
* simplify WKB columns collection
* simplify WKB conversion
* cosmetics

Co-authored-by: Lukas Riedel <[email protected]>
1 parent 51b66fa commit 1a6df8e
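
The commit replaces pickled shapely objects with their WKB (well-known binary) encoding when the exposures GeoDataFrame is written to HDF5. The underlying geopandas round trip looks roughly like the following sketch, which is independent of CLIMADA (the data and column names are illustrative):

    import geopandas as gpd
    from shapely.geometry import Point

    # A tiny GeoDataFrame standing in for Exposures.data (illustrative values).
    gdf = gpd.GeoDataFrame(
        {"value": [1.0, 2.0]},
        geometry=[Point(8.55, 47.37), Point(7.45, 46.95)],
        crs="EPSG:4326",
    )

    # Encode every geometry column as WKB bytes: the result is a plain pandas
    # DataFrame that can be stored without pickling shapely objects.
    plain_df = gdf.to_wkb()
    print(type(plain_df["geometry"].iloc[0]))  # <class 'bytes'>

    # Decode the bytes back into geometries.
    restored = gpd.GeoSeries.from_wkb(plain_df["geometry"], crs="EPSG:4326")
    print(restored.iloc[0])  # POINT (8.55 47.37)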

File tree

3 files changed: +48 -27 lines


CHANGELOG.md (+5)

@@ -15,18 +15,23 @@ Removed:
 - `pandas-datareader`
 
 ### Added
+
 - Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029)
 
 ### Changed
+
 - `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
 - World Bank indicator data is now downloaded directly from their API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
+- `Exposures.write_hdf5` serializes geometry data in WKB format instead of pickling shapely objects, which is faster and more sustainable. [#1051](https://github.com/CLIMADA-project/climada_python/pull/1051)
 
 ### Fixed
+
 - NaN plotting issues in `geo_im_from_array` [#1038](https://github.com/CLIMADA-project/climada_python/pull/1038)
 
 ### Deprecated
 
 ### Removed
+
 - `climada.util.interpolation.round_to_sig_digits` [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
 
 ## 6.0.1
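
For users, the read/write API is unchanged; only the on-disk representation of geometry columns differs. A minimal usage sketch, assuming a working CLIMADA installation with HDF5 support (the toy data and file name below are made up):

    import geopandas as gpd
    from climada.entity import Exposures

    # Build a tiny Exposures object from an illustrative GeoDataFrame
    # (the HDF5 backend requires the `tables` package to be installed).
    gdf = gpd.GeoDataFrame(
        {"value": [100.0, 250.0]},
        geometry=gpd.points_from_xy([8.55, 7.45], [47.37, 46.95]),
        crs="EPSG:4326",
    )
    exp = Exposures(gdf)
    exp.check()

    # Write and read back; geometry columns travel as WKB bytes inside the HDF5 file.
    exp.write_hdf5("exposures_demo.h5")  # hypothetical file name
    exp_read = Exposures.from_hdf5("exposures_demo.h5")
    print(exp_read.gdf.geometry.head())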

climada/entity/exposures/base.py (+13 -5)

@@ -29,6 +29,7 @@
 
 import cartopy.crs as ccrs
 import contextily as ctx
+import geopandas as gpd
 import matplotlib.pyplot as plt
 import numpy as np
 import pandas as pd
@@ -1131,10 +1132,8 @@ def write_hdf5(self, file_name):
         """
         LOGGER.info("Writing %s", file_name)
         store = pd.HDFStore(file_name, mode="w")
-        pandas_df = pd.DataFrame(self.gdf)
-        for col in pandas_df.columns:
-            if str(pandas_df[col].dtype) == "geometry":
-                pandas_df[col] = np.asarray(self.gdf[col])
+        geocols = self.data.columns[self.data.dtypes == "geometry"].to_list()
+        pandas_df = self.data.to_wkb()
 
         # Avoid pandas PerformanceWarning when writing HDF5 data
         with warnings.catch_warnings():
@@ -1146,6 +1145,7 @@ def write_hdf5(self, file_name):
         for var in type(self)._metadata:
             var_meta[var] = getattr(self, var)
         var_meta["crs"] = self.crs
+        var_meta["wkb_columns"] = geocols
         store.get_storer("exposures").attrs.metadata = var_meta
 
         store.close()
@@ -1184,7 +1184,15 @@ def from_hdf5(cls, file_name):
             crs = metadata.get("crs", metadata.get("_crs"))
             if crs is None and metadata.get("meta"):
                 crs = metadata["meta"].get("crs")
-            exp = cls(store["exposures"], crs=crs)
+            data = pd.DataFrame(store["exposures"])
+
+            wkb_columns = (
+                metadata.pop("wkb_columns") if "wkb_columns" in metadata else []
+            )
+            for col in wkb_columns:
+                data[col] = gpd.GeoSeries.from_wkb(data[col])
+
+            exp = cls(data, crs=crs)
             for key, val in metadata.items():
                 if key in type(exp)._metadata:  # pylint: disable=protected-access
                     setattr(exp, key, val)
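
The mechanics of the new write/read path can be reproduced with plain pandas and geopandas alone: encode geometry columns as WKB, record their names in the storer's metadata, and decode them again on read. A self-contained sketch under those assumptions (the store key, file name, and columns are illustrative; the `tables` package is required for HDF5 support):

    import warnings

    import geopandas as gpd
    import pandas as pd
    from shapely.geometry import Point

    gdf = gpd.GeoDataFrame(
        {"value": [1.0, 2.0]},
        geometry=[Point(0, 0), Point(1, 1)],
        crs="EPSG:4326",
    )

    # Write: geometry columns become WKB bytes, their names go into the metadata.
    geocols = gdf.columns[gdf.dtypes == "geometry"].to_list()
    with pd.HDFStore("demo_wkb.h5", mode="w") as store:  # hypothetical file name
        with warnings.catch_warnings():
            # bytes columns are stored as pickled objects, which pandas flags
            warnings.simplefilter("ignore", category=pd.errors.PerformanceWarning)
            store.put("exposures", gdf.to_wkb())
        store.get_storer("exposures").attrs.metadata = {
            "crs": str(gdf.crs),
            "wkb_columns": geocols,
        }

    # Read: decode the recorded WKB columns back into geometries.
    with pd.HDFStore("demo_wkb.h5", mode="r") as store:
        metadata = store.get_storer("exposures").attrs.metadata
        data = pd.DataFrame(store["exposures"])
        for col in metadata.get("wkb_columns", []):
            data[col] = gpd.GeoSeries.from_wkb(data[col])
        restored = gpd.GeoDataFrame(data, geometry="geometry", crs=metadata["crs"])

    print(restored.dtypes)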

climada/entity/exposures/test/test_base.py (+30 -22)

@@ -378,11 +378,14 @@ def test_read_template_pass(self):
 
     def test_io_hdf5_pass(self):
         """write and read hdf5"""
-        exp_df = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
-        exp_df.check()
+        exp = Exposures(pd.read_excel(ENT_TEMPLATE_XLS), crs="epsg:32632")
+
         # set metadata
-        exp_df.ref_year = 2020
-        exp_df.value_unit = "XSD"
+        exp.ref_year = 2020
+        exp.value_unit = "XSD"
+
+        # add another geometry column
+        exp.data["geocol2"] = exp.data.geometry.copy(deep=True)
 
         file_name = DATA_DIR.joinpath("test_hdf5_exp.h5")
 
@@ -392,46 +395,51 @@ def test_io_hdf5_pass(self):
 
         with warnings.catch_warnings():
             warnings.simplefilter("error", category=pd.errors.PerformanceWarning)
-            exp_df.write_hdf5(file_name)
+            exp.write_hdf5(file_name=file_name)
 
         exp_read = Exposures.from_hdf5(file_name)
 
-        self.assertEqual(exp_df.ref_year, exp_read.ref_year)
-        self.assertEqual(exp_df.value_unit, exp_read.value_unit)
-        self.assertEqual(exp_df.description, exp_read.description)
-        np.testing.assert_array_equal(exp_df.latitude, exp_read.latitude)
-        np.testing.assert_array_equal(exp_df.longitude, exp_read.longitude)
-        np.testing.assert_array_equal(exp_df.value, exp_read.value)
+        self.assertEqual(exp.ref_year, exp_read.ref_year)
+        self.assertEqual(exp.value_unit, exp_read.value_unit)
+        self.assertEqual(exp.description, exp_read.description)
+        np.testing.assert_array_equal(exp.latitude, exp_read.latitude)
+        np.testing.assert_array_equal(exp.longitude, exp_read.longitude)
+        np.testing.assert_array_equal(exp.value, exp_read.value)
         np.testing.assert_array_equal(
-            exp_df.data["deductible"].values, exp_read.data["deductible"].values
+            exp.data["deductible"].values, exp_read.data["deductible"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["cover"].values, exp_read.data["cover"].values
+            exp.data["cover"].values, exp_read.data["cover"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["region_id"].values, exp_read.data["region_id"].values
+            exp.data["region_id"].values, exp_read.data["region_id"].values
        )
         np.testing.assert_array_equal(
-            exp_df.data["category_id"].values, exp_read.data["category_id"].values
+            exp.data["category_id"].values, exp_read.data["category_id"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["impf_TC"].values, exp_read.data["impf_TC"].values
+            exp.data["impf_TC"].values, exp_read.data["impf_TC"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["centr_TC"].values, exp_read.data["centr_TC"].values
+            exp.data["centr_TC"].values, exp_read.data["centr_TC"].values
         )
         np.testing.assert_array_equal(
-            exp_df.data["impf_FL"].values, exp_read.data["impf_FL"].values
+            exp.data["impf_FL"].values, exp_read.data["impf_FL"].values
        )
         np.testing.assert_array_equal(
-            exp_df.data["centr_FL"].values, exp_read.data["centr_FL"].values
+            exp.data["centr_FL"].values, exp_read.data["centr_FL"].values
         )
 
         self.assertTrue(
-            u_coord.equal_crs(exp_df.crs, exp_read.crs),
-            f"{exp_df.crs} and {exp_read.crs} are different",
+            u_coord.equal_crs(exp.crs, exp_read.crs),
+            f"{exp.crs} and {exp_read.crs} are different",
+        )
+        self.assertTrue(u_coord.equal_crs(exp.data.crs, exp_read.data.crs))
+
+        self.assertTrue(exp_read.data["geocol2"].dtype == "geometry")
+        np.testing.assert_array_equal(
+            exp.data["geocol2"].geometry, exp_read.data["geocol2"].values
         )
-        self.assertTrue(u_coord.equal_crs(exp_df.gdf.crs, exp_read.gdf.crs))
 
 
 class TestAddSea(unittest.TestCase):
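
The new assertions verify that a second geometry column ("geocol2") survives the round trip with geometry dtype and unchanged values. The same property can be checked for the bare WKB encode/decode step without the CLIMADA test fixtures; a small sketch with illustrative data:

    import geopandas as gpd
    from shapely.geometry import Point

    gdf = gpd.GeoDataFrame(geometry=[Point(0, 0), Point(1, 1)], crs="EPSG:4326")
    gdf["geocol2"] = gdf.geometry.copy(deep=True)  # second geometry column, as in the test

    # Round-trip the extra column through WKB and verify it is unchanged.
    encoded = gdf.to_wkb()
    decoded = gpd.GeoSeries.from_wkb(encoded["geocol2"], crs="EPSG:4326")

    assert decoded.dtype == "geometry"
    assert decoded.geom_equals(gpd.GeoSeries(gdf["geocol2"])).all()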
