Skip to content

Commit

Permalink
Merge pull request #1721 from lisphilar/issue1720
Browse files Browse the repository at this point in the history
Catch up with the change of compressed file format of population data
  • Loading branch information
lisphilar authored Aug 18, 2024
2 parents 91694d5 + 60bc568 commit cdda4f0
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 14 deletions.
16 changes: 9 additions & 7 deletions covsirphy/downloading/_db_wpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,9 @@ class _WPP(_DataBase):
"""
Access "World Population Prospects by United Nations" server.
https://population.un.org/wpp/
https://datahelpdesk.worldbank.org/knowledgebase/articles/898581-api-basic-call-structures
"""
TOP_URL = "https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/"
TOP_URL = "https://api.worldbank.org/v2/"
# File title without extensions and suffix
TITLE = "world-population-prospects"
# Dictionary of column names
Expand All @@ -20,7 +21,7 @@ class _WPP(_DataBase):
ALL_COLS = [Term.DATE, Term.ISO3, Term.PROVINCE, Term.CITY, Term.N]
# Stdout when downloading (shown at most one time)
STDOUT = "Retrieving datasets from World Population Prospects https://population.un.org/wpp/"
# Citation
# Citations
CITATION = 'United Nations, Department of Economic and Social Affairs,' \
' Population Division (2022). World Population Prospects 2022, Online Edition.'

Expand All @@ -38,13 +39,14 @@ def _country(self):
- City (object): NAs
- Population (numpy.float64): population values
"""
url = f"{self.TOP_URL}WPP2022_TotalPopulationBySex.zip"
df = self._provide(url=url, suffix="_level1", columns=list(self.COL_DICT.keys()))
df[self.DATE] = pd.to_datetime(df["Year"], format="%Y") + pd.offsets.DateOffset(months=6)
url = f"{self.TOP_URL}country/all/indicator/SP.POP.TOTL?per_page=20000"
df = pd.read_xml(url, parser="etree")
df[self.DATE] = pd.to_datetime(df["date"], format="%Y") + pd.offsets.DateOffset(months=6)
df = df.rename(columns={"countryiso3code": Term.ISO3})
df[self.PROVINCE] = self.NA
df[self.CITY] = self.NA
df[self.N] = df[self.N] * 1_000
return df.dropna(how="any").loc[:, self.ALL_COLS]
df[self.N] = df["value"]
return df.loc[:, self.ALL_COLS].dropna(how="any")

def _province(self, country):
"""Returns province-level data.
Expand Down
19 changes: 12 additions & 7 deletions covsirphy/downloading/_provider.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from urllib3 import PoolManager
from urllib3.util.ssl_ import create_urllib3_context
import warnings
from zipfile import ZipFile
from zipfile import ZipFile, BadZipFile
import numpy as np
import pandas as pd
from unidecode import unidecode
Expand Down Expand Up @@ -104,20 +104,25 @@ def read_csv(path, columns, date, date_format):
"header": 0, "usecols": columns, "encoding": "utf-8", "engine": "pyarrow",
"parse_dates": None if date is None else [date], "date_format": date_format,
}
if urlparse(path).scheme:
kwargs["storage_options"] = {"User-Agent": "Mozilla/5.0"}
try:
df = pd.read_csv(path, **kwargs)
df = pd.read_csv(
path,
storage_options={"User-Agent": "Mozilla/5.0"} if urlparse(path).scheme else None,
**kwargs
)
except URLError:
ctx = create_urllib3_context()
ctx.load_default_certs()
# 0x4 is the value of ssl.OP_LEGACY_SERVER_CONNECT; from Python 3.12, use that named constant (import ssl) instead of the literal
ctx.options |= 0x4
with PoolManager(ssl_context=ctx) as http:
r = http.request("GET", path)
with ZipFile(io.BytesIO(r.data), "r") as fh:
text = fh.read(f"{Path(path).stem}.csv")
df = pd.read_csv(io.StringIO(text.decode("utf-8")))
try:
with ZipFile(io.BytesIO(r.data), "r") as fh:
text = fh.read(f"{Path(path).stem}.csv")
df = pd.read_csv(io.StringIO(text.decode("utf-8")), **kwargs)
except BadZipFile:
df = pd.read_csv(io.BytesIO(r.data), **kwargs)
for col in df:
with contextlib.suppress(TypeError):
df[col] = df[col].apply(lambda x: unidecode(x) if len(x) else np.nan)
Expand Down

0 comments on commit cdda4f0

Please sign in to comment.