Merge pull request #1721 from lisphilar/issue1720

Catch up with the change of compressed file format of population data
lisphilar · Aug 18, 2024 · cdda4f0 · cdda4f0
2 parents 91694d5 + 60bc568
commit cdda4f0
Show file tree

Hide file tree

Showing 2 changed files with 21 additions and 14 deletions.
diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py
@@ -7,8 +7,9 @@ class _WPP(_DataBase):
     """
     Access "World Population Prospects by United Nations" server.
     https://population.un.org/wpp/
+    https://datahelpdesk.worldbank.org/knowledgebase/articles/898581-api-basic-call-structures
     """
-    TOP_URL = "https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/"
+    TOP_URL = "https://api.worldbank.org/v2/"
     # File title without extensions and suffix
     TITLE = "world-population-prospects"
     # Dictionary of column names
@@ -20,7 +21,7 @@ class _WPP(_DataBase):
     ALL_COLS = [Term.DATE, Term.ISO3, Term.PROVINCE, Term.CITY, Term.N]
     # Stdout when downloading (shown at most one time)
     STDOUT = "Retrieving datasets from World Population Prospects https://population.un.org/wpp/"
-    # Citation
+    # Citations
     CITATION = 'United Nations, Department of Economic and Social Affairs,' \
         ' Population Division (2022). World Population Prospects 2022, Online Edition.'
 
@@ -38,13 +39,14 @@ def _country(self):
                     - City (object): NAs
                     - Population (numpy.float64): population values
         """
-        url = f"{self.TOP_URL}WPP2022_TotalPopulationBySex.zip"
-        df = self._provide(url=url, suffix="_level1", columns=list(self.COL_DICT.keys()))
-        df[self.DATE] = pd.to_datetime(df["Year"], format="%Y") + pd.offsets.DateOffset(months=6)
+        url = f"{self.TOP_URL}country/all/indicator/SP.POP.TOTL?per_page=20000"
+        df = pd.read_xml(url,  parser="etree")
+        df[self.DATE] = pd.to_datetime(df["date"], format="%Y") + pd.offsets.DateOffset(months=6)
+        df = df.rename(columns={"countryiso3code": Term.ISO3})
         df[self.PROVINCE] = self.NA
         df[self.CITY] = self.NA
-        df[self.N] = df[self.N] * 1_000
-        return df.dropna(how="any").loc[:, self.ALL_COLS]
+        df[self.N] = df["value"]
+        return df.loc[:, self.ALL_COLS].dropna(how="any")
 
     def _province(self, country):
         """Returns province-level data.

diff --git a/covsirphy/downloading/_provider.py b/covsirphy/downloading/_provider.py
@@ -7,7 +7,7 @@
 from urllib3 import PoolManager
 from urllib3.util.ssl_ import create_urllib3_context
 import warnings
-from zipfile import ZipFile
+from zipfile import ZipFile, BadZipFile
 import numpy as np
 import pandas as pd
 from unidecode import unidecode
@@ -104,20 +104,25 @@ def read_csv(path, columns, date, date_format):
             "header": 0, "usecols": columns, "encoding": "utf-8", "engine": "pyarrow",
             "parse_dates": None if date is None else [date], "date_format": date_format,
         }
-        if urlparse(path).scheme:
-            kwargs["storage_options"] = {"User-Agent": "Mozilla/5.0"}
         try:
-            df = pd.read_csv(path, **kwargs)
+            df = pd.read_csv(
+                path,
+                storage_options={"User-Agent": "Mozilla/5.0"} if urlparse(path).scheme else None,
+                **kwargs
+            )
         except URLError:
             ctx = create_urllib3_context()
             ctx.load_default_certs()
             # 0x4 is the legacy-server-connect flag; from Python 3.12, `import ssl` and use ssl.OP_LEGACY_SERVER_CONNECT instead of the literal
             ctx.options |= 0x4
             with PoolManager(ssl_context=ctx) as http:
                 r = http.request("GET", path)
-                with ZipFile(io.BytesIO(r.data), "r") as fh:
-                    text = fh.read(f"{Path(path).stem}.csv")
-            df = pd.read_csv(io.StringIO(text.decode("utf-8")))
+                try:
+                    with ZipFile(io.BytesIO(r.data), "r") as fh:
+                        text = fh.read(f"{Path(path).stem}.csv")
+                        df = pd.read_csv(io.StringIO(text.decode("utf-8")), **kwargs)
+                except BadZipFile:
+                    df = pd.read_csv(io.BytesIO(r.data), **kwargs)
         for col in df:
             with contextlib.suppress(TypeError):
                 df[col] = df[col].apply(lambda x: unidecode(x) if len(x) else np.nan)