Remove pandas-datareader (#1033)

peanutfun · emanuel-schmid · web-flow · commit 4d2d690dd4f1 · 2025-04-16T09:41:20.000+02:00
* Remove pandas-datareader Use JSON/pandas solution for downloading World Bank indicator data. * Add function `download_world_bank_indicator`. * Add unit test. * Update requirements. * Update CHANGELOG.md * Remove stray print and fix comments * Switch to compatible Petals target branch for testing REVERT THIS! * Fix linter warnings - Add timeout parameter to requests call - Remove unused import * #168 is merged Co-authored-by: Lukas Riedel <34276446+peanutfun@users.noreply.github.com> * Apply suggestions from code review Use single list instead of nested lists Co-authored-by: Emanuel Schmid <51439563+emanuel-schmid@users.noreply.github.com> * Update reading WB data * Fall back to parsing dates if conversion to ints fails. * Throw a ValueError if no data is available. --------- Co-authored-by: emanuel-schmid <schmide@ethz.ch> Co-authored-by: Emanuel Schmid <51439563+emanuel-schmid@users.noreply.github.com>
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -10,11 +10,17 @@ Code freeze date: YYYY-MM-DD
 
 ### Dependency Changes
 
+Removed:
+
+- `pandas-datareader`
+
 ### Added
 - Added instructions to install Climada petals on Euler cluster in `doc.guide.Guide_Euler.ipynb` [#1029](https://github.com/CLIMADA-project/climada_python/pull/1029)
 
 ### Changed
 - `Hazard.local_exceedance_intensity`, `Hazard.local_return_period` and `Impact.local_exceedance_impact`, `Impact.local_return_period`, using the `climada.util.interpolation` module: New default (no binning), binning on decimals, and faster implementation [#1012](https://github.com/CLIMADA-project/climada_python/pull/1012)
+- World Bank indicator data is now downloaded directly from the World Bank API via the function `download_world_bank_indicator`, instead of relying on the `pandas-datareader` package [#1033](https://github.com/CLIMADA-project/climada_python/pull/1033)
+
 ### Fixed
 - NaN plotting issues in `geo_im_from_array`[#1038](https://github.com/CLIMADA-project/climada_python/pull/1038)
 
diff --git a/climada/util/finance.py b/climada/util/finance.py
@@ -21,17 +21,16 @@
 
 __all__ = ["net_present_value", "income_group", "gdp"]
 
+import json
 import logging
 import shutil
-import warnings
 import zipfile
 from pathlib import Path
 
 import numpy as np
 import pandas as pd
 import requests
 from cartopy.io import shapereader
-from pandas_datareader import wb
 
 from climada.util.constants import SYSTEM_DIR
 from climada.util.files_handler import download_file
@@ -181,6 +180,77 @@ def gdp(cntry_iso, ref_year, shp_file=None, per_capita=False):
     return close_year, close_val
 
 
+def download_world_bank_indicator(
+    country_code: str, indicator: str, parse_dates: bool = False
+):
+    """Download indicator data from the World Bank API for all years or dates on record
+
+    Parameters
+    ----------
+    country_code : str
+        The country code in ISO alpha 3
+    indicator : str
+        The ID of the indicator in the World Bank API
+    parse_dates : bool, optional
+        Whether the dates of the indicator data should be parsed as datetime objects.
+        If ``False`` (default), this will first try to parse them as ``int`` (this only
+        works for yearly data), and then parse as datetime objects if that fails.
+
+    Returns
+    -------
+    pd.Series
+        A series with the values of the indicator for all dates (years) on record
+    """
+    # Download data from API
+    raw_data = []
+    pages = np.inf
+    page = 1
+    while page <= pages:
+        response = requests.get(
+            f"https://api.worldbank.org/v2/countries/{country_code}/indicators/"
+            f"{indicator}?format=json&page={page}",
+            timeout=30,
+        )
+        json_data = json.loads(response.text)
+
+        # Check if we received an error message
+        try:
+            if json_data[0]["message"][0]["id"] == "120":
+                raise RuntimeError(
+                    "Error requesting data from the World Bank API. Did you use the "
+                    "correct country code and indicator ID?"
+                )
+        # If no, we should be fine
+        except KeyError:
+            pass
+
+        # Check if there is no data available
+        pages = json_data[0]["pages"]
+        if pages == 0:
+            raise ValueError(
+                f"No data available for country {country_code}, indicator {indicator}"
+            )
+
+        # Update the data
+        page = page + 1
+        raw_data.extend(json_data[1])
+
+    # Create dataframe
+    data = pd.DataFrame.from_records(raw_data)
+
+    # Maybe parse dates
+    if parse_dates:
+        data["date"] = pd.DatetimeIndex(data["date"])
+    else:
+        try:
+            data["date"] = data["date"].astype("int")
+        except TypeError:
+            data["date"] = pd.DatetimeIndex(data["date"])
+
+    # Only return indicator data (with a proper name)
+    return data.set_index("date")["value"].rename(data["indicator"].iloc[0]["value"])
+
+
 def world_bank(cntry_iso, ref_year, info_ind):
     """Get country's GDP from World Bank's data at a given year, or
     closest year value. If no data, get the natural earth's approximation.
@@ -204,18 +274,14 @@ def world_bank(cntry_iso, ref_year, info_ind):
     IOError, KeyError, IndexError
     """
     if info_ind != "INC_GRP":
-        with warnings.catch_warnings():
-            warnings.simplefilter("ignore")
-            cntry_gdp = wb.download(
-                indicator=info_ind, country=cntry_iso, start=1960, end=2030
-            )
-        years = np.array(
-            [int(year) for year in cntry_gdp.index.get_level_values("year")]
+        cntry_gdp = download_world_bank_indicator(
+            indicator=info_ind, country_code=cntry_iso, parse_dates=False
         )
+        years = cntry_gdp.index
         sort_years = np.abs(years - ref_year).argsort()
         close_val = cntry_gdp.iloc[sort_years].dropna()
-        close_year = int(close_val.iloc[0].name[1])
-        close_val = float(close_val.iloc[0].values)
+        close_year = close_val.index[0]
+        close_val = float(close_val.iloc[0])
     else:  # income group level
         fn_ig = SYSTEM_DIR.joinpath("OGHIST.xls")
         dfr_wb = pd.DataFrame()
diff --git a/climada/util/test/test_finance.py b/climada/util/test/test_finance.py
@@ -26,6 +26,7 @@
 
 from climada.util.finance import (
     _gdp_twn,
+    download_world_bank_indicator,
     gdp,
     income_group,
     nat_earth_adm0,
@@ -137,6 +138,34 @@ def test_wb_esp_1950_pass(self):
         self.assertEqual(wb_year, ref_year)
         self.assertAlmostEqual(wb_val, ref_val)
 
+    def test_download_wb_data(self):
+        """Test downloading data via the API"""
+        # Unfortunate reference test
+        data = download_world_bank_indicator("ESP", "NY.GDP.MKTP.CD")
+        self.assertAlmostEqual(data[1960], 12424514013.7604)
+        self.assertEqual(data.name, "GDP (current US$)")
+
+        # Check parsing dates
+        data = download_world_bank_indicator("ESP", "NY.GDP.MKTP.CD", parse_dates=True)
+        self.assertEqual(data.index[-1], np.datetime64("1960-01-01"))
+
+        # Check errors raised
+        with self.assertRaisesRegex(
+            RuntimeError,
+            "Did you use the correct country code",
+        ):
+            download_world_bank_indicator("Spain", "NY.GDP.MKTP.CD")
+        with self.assertRaisesRegex(
+            RuntimeError,
+            "Did you use the correct country code",
+        ):
+            download_world_bank_indicator("ESP", "BogusIndicator")
+        with self.assertRaisesRegex(
+            ValueError,
+            "No data available for country AIA, indicator NY.GDP.MKTP.CD",
+        ):
+            download_world_bank_indicator("AIA", "NY.GDP.MKTP.CD")
+
 
 class TestWealth2GDP(unittest.TestCase):
     """Test Wealth to GDP factor extraction"""
diff --git a/requirements/env_climada.yml b/requirements/env_climada.yml
@@ -20,7 +20,6 @@ dependencies:
   - openpyxl>=3.1
   - osm-flex>=1.1
   - pandas>=2.1,<2.2  # 2.2 is not compatible with the default pytables=3.7 and yields a very high deprecation warning number through geopandas
-  - pandas-datareader>=0.10
   - pathos>=0.3
   - pint>=0.24
   - pip
diff --git a/setup.py b/setup.py
@@ -77,7 +77,6 @@
         "openpyxl",
         "overpy",
         "pandas",
-        "pandas-datareader",
         "pathos",
         "peewee",
         "pillow",