From e289e419e1e060a21e4887de54e0ea7c320df21c Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Fri, 26 Jul 2024 21:58:20 +0900 Subject: [PATCH 01/11] fix #1720 --- covsirphy/downloading/_db_wpp.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index 74c1937ca..e08ecb921 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -20,7 +20,7 @@ class _WPP(_DataBase): ALL_COLS = [Term.DATE, Term.ISO3, Term.PROVINCE, Term.CITY, Term.N] # Stdout when downloading (shown at most one time) STDOUT = "Retrieving datasets from World Population Prospects https://population.un.org/wpp/" - # Citation + # Citations CITATION = 'United Nations, Department of Economic and Social Affairs,' \ ' Population Division (2022). World Population Prospects 2022, Online Edition.' @@ -38,7 +38,7 @@ def _country(self): - City (object): NAs - Population (numpy.float64): population values """ - url = f"{self.TOP_URL}WPP2022_TotalPopulationBySex.zip" + url = f"{self.TOP_URL}WPP2022_TotalPopulationBySex.csv.gz" df = self._provide(url=url, suffix="_level1", columns=list(self.COL_DICT.keys())) df[self.DATE] = pd.to_datetime(df["Year"], format="%Y") + pd.offsets.DateOffset(months=6) df[self.PROVINCE] = self.NA From a947f042849b2233182fff085ba6a8a4da732d8a Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Fri, 26 Jul 2024 22:09:48 +0900 Subject: [PATCH 02/11] fix: population data update to 2024 version --- covsirphy/downloading/_db_wpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index e08ecb921..064731d8c 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -38,7 +38,7 @@ def _country(self): - City (object): NAs - Population (numpy.float64): population values """ - url = 
f"{self.TOP_URL}WPP2022_TotalPopulationBySex.csv.gz" + url = f"{self.TOP_URL}WPP2024_TotalPopulationBySex.csv.gz" df = self._provide(url=url, suffix="_level1", columns=list(self.COL_DICT.keys())) df[self.DATE] = pd.to_datetime(df["Year"], format="%Y") + pd.offsets.DateOffset(months=6) df[self.PROVINCE] = self.NA From db640eb3090822d7b8933d9a47f34a3c5e9a7175 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Fri, 26 Jul 2024 23:01:12 +0900 Subject: [PATCH 03/11] fix: errors --- covsirphy/downloading/_db_wpp.py | 7 ++++++- covsirphy/downloading/_provider.py | 19 ++++++++++++------- 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index 064731d8c..c8dade3d7 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -1,4 +1,5 @@ import pandas as pd +from pyarrow.lib import ArrowKeyError from covsirphy.util.term import Term from covsirphy.downloading._db import _DataBase @@ -39,7 +40,11 @@ def _country(self): - Population (numpy.float64): population values """ url = f"{self.TOP_URL}WPP2024_TotalPopulationBySex.csv.gz" - df = self._provide(url=url, suffix="_level1", columns=list(self.COL_DICT.keys())) + try: + df = self._provide(url=url, suffix="_level1", columns=list(self.COL_DICT.keys())) + except ArrowKeyError: + df = self._provide(url=url, suffix="_level1", columns=None) + df = df.rename(self.COL_DICT.keys()) df[self.DATE] = pd.to_datetime(df["Year"], format="%Y") + pd.offsets.DateOffset(months=6) df[self.PROVINCE] = self.NA df[self.CITY] = self.NA diff --git a/covsirphy/downloading/_provider.py b/covsirphy/downloading/_provider.py index 6653b8b70..752a506d2 100644 --- a/covsirphy/downloading/_provider.py +++ b/covsirphy/downloading/_provider.py @@ -7,7 +7,7 @@ from urllib3 import PoolManager from urllib3.util.ssl_ import create_urllib3_context import warnings -from zipfile import ZipFile +from zipfile import 
ZipFile,BadZipFile import numpy as np import pandas as pd from unidecode import unidecode @@ -104,10 +104,12 @@ def read_csv(path, columns, date, date_format): "header": 0, "usecols": columns, "encoding": "utf-8", "engine": "pyarrow", "parse_dates": None if date is None else [date], "date_format": date_format, } - if urlparse(path).scheme: - kwargs["storage_options"] = {"User-Agent": "Mozilla/5.0"} try: - df = pd.read_csv(path, **kwargs) + df = pd.read_csv( + path, + storage_options={"User-Agent": "Mozilla/5.0"} if urlparse(path).scheme else None, + **kwargs + ) except URLError: ctx = create_urllib3_context() ctx.load_default_certs() @@ -115,9 +117,12 @@ def read_csv(path, columns, date, date_format): ctx.options |= 0x4 with PoolManager(ssl_context=ctx) as http: r = http.request("GET", path) - with ZipFile(io.BytesIO(r.data), "r") as fh: - text = fh.read(f"{Path(path).stem}.csv") - df = pd.read_csv(io.StringIO(text.decode("utf-8"))) + try: + with ZipFile(io.BytesIO(r.data), "r") as fh: + text = fh.read(f"{Path(path).stem}.csv") + df = pd.read_csv(io.StringIO(text.decode("utf-8")), **kwargs) + except BadZipFile: + df = pd.read_csv(io.BytesIO(r.data), **kwargs) for col in df: with contextlib.suppress(TypeError): df[col] = df[col].apply(lambda x: unidecode(x) if len(x) else np.nan) From 80063a5545964f7f6d739f77bded284027e28c22 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Sun, 18 Aug 2024 17:42:11 +0900 Subject: [PATCH 04/11] update: use The World bank API v2 --- covsirphy/downloading/_db_wpp.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index c8dade3d7..638fbd077 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -8,8 +8,9 @@ class _WPP(_DataBase): """ Access "World Population Prospects by United nations" server. 
https://population.un.org/wpp/ + https://datahelpdesk.worldbank.org/knowledgebase/articles/898581-api-basic-call-structures """ - TOP_URL = "https://population.un.org/wpp/Download/Files/1_Indicators%20(Standard)/CSV_FILES/" + TOP_URL = "https://api.worldbank.org/v2/" # File title without extensions and suffix TITLE = "world-population-prospects" # Dictionary of column names @@ -39,13 +40,10 @@ def _country(self): - City (object): NAs - Population (numpy.float64): population values """ - url = f"{self.TOP_URL}WPP2024_TotalPopulationBySex.csv.gz" - try: - df = self._provide(url=url, suffix="_level1", columns=list(self.COL_DICT.keys())) - except ArrowKeyError: - df = self._provide(url=url, suffix="_level1", columns=None) - df = df.rename(self.COL_DICT.keys()) - df[self.DATE] = pd.to_datetime(df["Year"], format="%Y") + pd.offsets.DateOffset(months=6) + url = f"{self.TOP_URL}country/all/indicator/SP.POP.TOTLL?per_page=30000" + df = pd.read_xml(url, parser="etree") + df[self.DATE] = pd.to_datetime(df["date"], format="%Y") + pd.offsets.DateOffset(months=6) + df = df.rename({"countryiso3code": Term.ISO3}) df[self.PROVINCE] = self.NA df[self.CITY] = self.NA df[self.N] = df[self.N] * 1_000 From 175aaf24d86bb737c12e7ae92a88f39752599054 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Sun, 18 Aug 2024 17:44:05 +0900 Subject: [PATCH 05/11] fix: F401 --- covsirphy/downloading/_db_wpp.py | 1 - 1 file changed, 1 deletion(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index 638fbd077..d61d8808d 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -1,5 +1,4 @@ import pandas as pd -from pyarrow.lib import ArrowKeyError from covsirphy.util.term import Term from covsirphy.downloading._db import _DataBase From 1cc6f74a0b24fc28690665bed045076ec639c273 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Sun, 18 Aug 2024 17:46:11 +0900 
Subject: [PATCH 06/11] fix: E231 --- covsirphy/downloading/_provider.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covsirphy/downloading/_provider.py b/covsirphy/downloading/_provider.py index 752a506d2..eb7c1cf6d 100644 --- a/covsirphy/downloading/_provider.py +++ b/covsirphy/downloading/_provider.py @@ -7,7 +7,7 @@ from urllib3 import PoolManager from urllib3.util.ssl_ import create_urllib3_context import warnings -from zipfile import ZipFile,BadZipFile +from zipfile import ZipFile, BadZipFile import numpy as np import pandas as pd from unidecode import unidecode From 54e5314c26e8c56b50aa98898d92b51986866440 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:01:01 +0900 Subject: [PATCH 07/11] fix: KeyError --- covsirphy/downloading/_db_wpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index d61d8808d..23e2d5586 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -45,7 +45,7 @@ def _country(self): df = df.rename({"countryiso3code": Term.ISO3}) df[self.PROVINCE] = self.NA df[self.CITY] = self.NA - df[self.N] = df[self.N] * 1_000 + df[self.N] = df["value"] return df.dropna(how="any").loc[:, self.ALL_COLS] def _province(self, country): From afec214548c5e582b3a42a90f03ccca2d9f824c8 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:10:02 +0900 Subject: [PATCH 08/11] chores: print test --- covsirphy/downloading/_db_wpp.py | 1 + 1 file changed, 1 insertion(+) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index 23e2d5586..2b092f9a9 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -41,6 +41,7 @@ def _country(self): """ url = f"{self.TOP_URL}country/all/indicator/SP.POP.TOTLL?per_page=30000" df = pd.read_xml(url, parser="etree") + 
print(df.columns) df[self.DATE] = pd.to_datetime(df["date"], format="%Y") + pd.offsets.DateOffset(months=6) df = df.rename({"countryiso3code": Term.ISO3}) df[self.PROVINCE] = self.NA From 30b8564a5769fdadfa8639126a63fe93c2650d48 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:17:08 +0900 Subject: [PATCH 09/11] fix: error in URL --- covsirphy/downloading/_db_wpp.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index 2b092f9a9..cfc4d5965 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -39,9 +39,8 @@ def _country(self): - City (object): NAs - Population (numpy.float64): population values """ - url = f"{self.TOP_URL}country/all/indicator/SP.POP.TOTLL?per_page=30000" + url = f"{self.TOP_URL}country/all/indicator/SP.POP.TOTL?per_page=20000" df = pd.read_xml(url, parser="etree") - print(df.columns) df[self.DATE] = pd.to_datetime(df["date"], format="%Y") + pd.offsets.DateOffset(months=6) df = df.rename({"countryiso3code": Term.ISO3}) df[self.PROVINCE] = self.NA From 662d1a46fd39cf8ede84c04ab58530bd208213c6 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:26:24 +0900 Subject: [PATCH 10/11] fix: KeyError --- covsirphy/downloading/_db_wpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index cfc4d5965..f26431dc3 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -42,7 +42,7 @@ def _country(self): url = f"{self.TOP_URL}country/all/indicator/SP.POP.TOTL?per_page=20000" df = pd.read_xml(url, parser="etree") df[self.DATE] = pd.to_datetime(df["date"], format="%Y") + pd.offsets.DateOffset(months=6) - df = df.rename({"countryiso3code": Term.ISO3}) + df = df.rename(columns={"countryiso3code": Term.ISO3}) 
df[self.PROVINCE] = self.NA df[self.CITY] = self.NA df[self.N] = df["value"] From 60bc568cecfe367cc69385de48bc6029b1ba0cc5 Mon Sep 17 00:00:00 2001 From: lisphilar <7270139+lisphilar@users.noreply.github.com> Date: Sun, 18 Aug 2024 18:43:25 +0900 Subject: [PATCH 11/11] fix: SubsetNotFoundError --- covsirphy/downloading/_db_wpp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/covsirphy/downloading/_db_wpp.py b/covsirphy/downloading/_db_wpp.py index f26431dc3..1f3edf9fe 100644 --- a/covsirphy/downloading/_db_wpp.py +++ b/covsirphy/downloading/_db_wpp.py @@ -46,7 +46,7 @@ def _country(self): df[self.PROVINCE] = self.NA df[self.CITY] = self.NA df[self.N] = df["value"] - return df.dropna(how="any").loc[:, self.ALL_COLS] + return df.loc[:, self.ALL_COLS].dropna(how="any") def _province(self, country): """Returns province-level data.