From 5ee55b2b3f1a26ded1ce1bd8bf54836febebebf4 Mon Sep 17 00:00:00 2001 From: KameniAlexNea Date: Sun, 28 Apr 2024 17:23:32 +0200 Subject: [PATCH 1/4] feat(#8): implement a wrapper to read data-gouv website and download file --- etl/requirements.txt | 2 ++ etl/update_database.py | 41 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 43 insertions(+) create mode 100644 etl/update_database.py diff --git a/etl/requirements.txt b/etl/requirements.txt index 6100c48..0529966 100644 --- a/etl/requirements.txt +++ b/etl/requirements.txt @@ -28,3 +28,5 @@ chainlit==0.5.1 tornado>=6.3.3 # not directly required, pinned by Snyk to avoid a vulnerability aiohttp>=3.9.0 # not directly required, pinned by Snyk to avoid a vulnerability sentry_sdk==1.39.1 + +beautifulsoup4==4.12.3 \ No newline at end of file diff --git a/etl/update_database.py b/etl/update_database.py new file mode 100644 index 0000000..24a7666 --- /dev/null +++ b/etl/update_database.py @@ -0,0 +1,41 @@ +import os +import requests +from bs4 import BeautifulSoup + +DATA_GOUV_PATH = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/" + +def read_data_gouv_page(): + headers = {'User-Agent': None} + response = requests.get(DATA_GOUV_PATH, headers=headers) + if 200 <= response.status_code <= 300: + return response.content + raise Exception(response.content) + +def download_link(url: str, headers=None): + if url.endswith("download") or url.endswith((".pdf", ".docx", ".zip", ".exe", ".jpg", ".png")): + response = requests.get(url, headers=headers) + if (200 <= response.status_code <= 300): + name = os.path.basename(url) + with open(name, "wb") as file: + file.write(response.content) + return name + +def search_and_download_data(): + page = read_data_gouv_page() + soup = BeautifulSoup(page, 'html.parser') + links = soup.find_all('a', href=True) + links: list[str] = [ + i["href"] for i in links if ("media.interieur.gouv" in i["href"]) + ] + rna_import = [i for i in links if "rna_import" in i] + rna_waldec = [i for i in links if "rna_waldec" in i] + + rna_import = sorted(rna_import, reverse=True)[0] + rna_waldec = sorted(rna_waldec, reverse=True)[0] + + rna_import = download_link(rna_import) + rna_waldec = download_link(rna_waldec) + return rna_import, rna_waldec + +if __name__ == "__main__": + search_and_download_data() \ No newline at end of file From 96a0fe51ca995399befbfb51aeafa23ec4b3bf19 Mon Sep 17 00:00:00 2001 From: KameniAlexNea Date: Sun, 20 Oct 2024 14:04:39 +0200 Subject: [PATCH 2/4] feat(#8): implement a wrapper to read data-gouv website and download file --- etl/filter-cameroon.py | 43 +++++++++++++++++++----------------------- etl/update_database.py | 27 +++++++++++++++++++++----- 2 files changed, 41 insertions(+), 29 deletions(-) diff --git a/etl/filter-cameroon.py b/etl/filter-cameroon.py index 8bd5df4..8d6f508 100644 --- a/etl/filter-cameroon.py +++ b/etl/filter-cameroon.py @@ -1,4 +1,4 @@ -# %% + # CSV Files downloaded from https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/ Fichier RNA Waldec du 01 Mars 2022 import datetime as dt import glob @@ -15,10 +15,15 @@ from lambdaprompt import GPT3Prompt from pandarallel import pandarallel from rich.console import Console +from argparse import Namespace, ArgumentParser + +parser = ArgumentParser() +parser.add_argument("--rna_folder", default="rna_waldec_20220301/") + +args, _ = parser.parse_known_args() -# %% start = time.time() -file_location = os.getcwd() + "/rna_waldec_20220301/" +file_location = os.path.join(os.getcwd(), args.rna_folder) all_files = glob.glob(os.path.join(file_location, "*.csv")) columns = [ @@ -42,9 +47,10 @@ f, delimiter=";", header=0, - encoding="ISO-8859-1", + # encoding="ISO-8859-1", usecols=columns, engine="c", + low_memory=False ) for f in all_files ], @@ -54,7 +60,7 @@ end = time.time() print(f"Time to read all CSV : {dt.timedelta(seconds=end - start)}") -# %% + ssm = boto3.client("ssm", region_name="eu-central-1") openai.api_key = ssm.get_parameter( @@ -65,7 +71,7 @@ os.environ["OPENAI_API_KEY"] = openai.api_key -# %% + start = time.time() @@ -138,7 +144,7 @@ def select_relevant_columns(df): end = time.time() print(f"Time to Filter Rows : {dt.timedelta(seconds=end - start)}") -# %% + text_prompt = """ Normalize the addresses in french. Don't ignore any lines and treat each address separetely and go step by step @@ -223,17 +229,6 @@ def select_relevant_columns(df): all_adresses = [x.strip() for x in all_adresses] # Build adresse by concatenation -df2["adrs"] = ( - df2["adrs_numvoie"].map(str) - + " " - + df2["adrs_typevoie"].map(str) - + " " - + df2["adrs_libvoie"].map(str) - + " " - + df2["adrs_codepostal"].map(str) - + " " - + df2["adrs_libcommune"].map(str) -) df_cameroon_associations["adrs"] = ( df_cameroon_associations["adrs_numvoie"].map(str) + " " @@ -258,7 +253,7 @@ def select_relevant_columns(df): ] print(f"{len(df_not_in_cache)} adresses not present in cache...") -# %% + if len(df_not_in_cache) > 0: num_batches = int(np.ceil(len(df_not_in_cache) / 25)) batches = np.array_split(df_not_in_cache, num_batches) @@ -280,7 +275,7 @@ def select_relevant_columns(df): time.sleep(120) batch["adrs"] = cache[list_adresses] -# %% + # Downloaded from https://download.geonames.org/export/zip/ region_by_postal_codes = pd.read_csv( "code-postal-geonames.tsv", delimiter="\t", index_col=1 @@ -316,7 +311,7 @@ def select_relevant_columns(df): waldec_csv[40580] = "ACTIVTÉS RELIGIEUSES, SPIRITUELLES OU PHILOSOPHIQUES" -# %% + def get_dept_region(code_postal): try: @@ -372,7 +367,7 @@ def add_social_object_libelle(df): # get_info("W212001727") # get_dept_region(30913) -# %% + pandarallel.initialize(progress_bar=True) requests_cache.install_cache("geocode_cache") @@ -428,7 +423,7 @@ def format_libelle_for_gogocarto(df): format_libelle_for_gogocarto ) -# %% + def remove_space_at_the_end(x: str): @@ -463,7 +458,7 @@ def normalize_final(data: pd.DataFrame): df_cameroon_associations = df_cameroon_associations.pipe(normalize_final) -# %% + df_cameroon_associations.to_csv("rna-real-mars-2022-new.csv") diff --git a/etl/update_database.py b/etl/update_database.py index 24a7666..864aa57 100644 --- a/etl/update_database.py +++ b/etl/update_database.py @@ -1,6 +1,9 @@ import os import requests from bs4 import BeautifulSoup +from zipfile import ZipFile +import sys +import runpy DATA_GOUV_PATH = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/" @@ -19,6 +22,12 @@ def download_link(url: str, headers=None): with open(name, "wb") as file: file.write(response.content) return name + +def unzip_and_delete(path: str): + zipped = ZipFile(path) + zipped.extractall(path.replace(".zip", "")) + zipped.close() + return path.replace(".zip", "") def search_and_download_data(): page = read_data_gouv_page() @@ -27,15 +36,23 @@ def search_and_download_data(): links: list[str] = [ i["href"] for i in links if ("media.interieur.gouv" in i["href"]) ] - rna_import = [i for i in links if "rna_import" in i] + # rna_import = [i for i in links if "rna_import" in i] rna_waldec = [i for i in links if "rna_waldec" in i] - rna_import = sorted(rna_import, reverse=True)[0] + # rna_import = sorted(rna_import, reverse=True)[0] rna_waldec = sorted(rna_waldec, reverse=True)[0] - rna_import = download_link(rna_import) + # rna_import = download_link(rna_import) rna_waldec = download_link(rna_waldec) - return rna_import, rna_waldec + return rna_waldec if __name__ == "__main__": - search_and_download_data() \ No newline at end of file + print("Searching for lastest rna waldec version") + path = search_and_download_data() + folder = path.replace(".zip", "") + print("extracting rna data") + unzip_and_delete(path) + print("delete zip file") + os.remove(path) + folder = "rna_waldec_20241001" + os.system(f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'") \ No newline at end of file From 8f431149ffdfe4eeb1a9b53a8944f336c62d57f8 Mon Sep 17 00:00:00 2001 From: KameniAlexNea Date: Sun, 20 Oct 2024 14:10:51 +0200 Subject: [PATCH 3/4] feat(#8): implement a wrapper to read data-gouv website and download file --- etl/filter-cameroon.py | 7 ------- etl/update_database.py | 14 +++++++++----- 2 files changed, 9 insertions(+), 12 deletions(-) diff --git a/etl/filter-cameroon.py b/etl/filter-cameroon.py index 8d6f508..b664033 100644 --- a/etl/filter-cameroon.py +++ b/etl/filter-cameroon.py @@ -71,7 +71,6 @@ os.environ["OPENAI_API_KEY"] = openai.api_key - start = time.time() @@ -136,7 +135,6 @@ def select_relevant_columns(df): ] - df_cameroon_associations = ( df_associations.pipe(filter_cameroon).pipe(remove_closed).pipe(normalize) ) @@ -311,8 +309,6 @@ def select_relevant_columns(df): waldec_csv[40580] = "ACTIVTÉS RELIGIEUSES, SPIRITUELLES OU PHILOSOPHIQUES" - - def get_dept_region(code_postal): try: dept = dept_by_postal_codes[str(code_postal)] @@ -424,8 +420,6 @@ def format_libelle_for_gogocarto(df): ) - - def remove_space_at_the_end(x: str): if x is not None: return x.strip() @@ -458,7 +452,6 @@ def normalize_final(data: pd.DataFrame): df_cameroon_associations = df_cameroon_associations.pipe(normalize_final) - df_cameroon_associations.to_csv("rna-real-mars-2022-new.csv") diff --git a/etl/update_database.py b/etl/update_database.py index 864aa57..bf48c2c 100644 --- a/etl/update_database.py +++ b/etl/update_database.py @@ -2,11 +2,10 @@ import requests from bs4 import BeautifulSoup from zipfile import ZipFile -import sys -import runpy DATA_GOUV_PATH = "https://www.data.gouv.fr/fr/datasets/repertoire-national-des-associations/" + def read_data_gouv_page(): headers = {'User-Agent': None} response = requests.get(DATA_GOUV_PATH, headers=headers) @@ -14,21 +13,24 @@ def read_data_gouv_page(): return response.content raise Exception(response.content) + def download_link(url: str, headers=None): if url.endswith("download") or url.endswith((".pdf", ".docx", ".zip", ".exe", ".jpg", ".png")): response = requests.get(url, headers=headers) - if (200 <= response.status_code <= 300): + if (200 <= response.status_code <= 300): name = os.path.basename(url) with open(name, "wb") as file: file.write(response.content) return name - + + def unzip_and_delete(path: str): zipped = ZipFile(path) zipped.extractall(path.replace(".zip", "")) zipped.close() return path.replace(".zip", "") + def search_and_download_data(): page = read_data_gouv_page() soup = BeautifulSoup(page, 'html.parser') @@ -46,6 +48,7 @@ def search_and_download_data(): rna_waldec = download_link(rna_waldec) return rna_waldec + if __name__ == "__main__": print("Searching for lastest rna waldec version") path = search_and_download_data() @@ -55,4 +58,5 @@ def search_and_download_data(): print("delete zip file") os.remove(path) folder = "rna_waldec_20241001" - os.system(f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'") \ No newline at end of file + os.system( + f"secretsfoundry run --script 'python filter-cameroon.py --rna_folder {folder}'") From 55ff7937fbee26431a0af53ba8a5824e6461e07d Mon Sep 17 00:00:00 2001 From: KameniAlexNea Date: Sun, 20 Oct 2024 14:15:44 +0200 Subject: [PATCH 4/4] feat(#8): implement a wrapper to read data-gouv website and download file --- etl/requirements.txt | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/etl/requirements.txt b/etl/requirements.txt index 0529966..23cc884 100644 --- a/etl/requirements.txt +++ b/etl/requirements.txt @@ -28,5 +28,4 @@ chainlit==0.5.1 tornado>=6.3.3 # not directly required, pinned by Snyk to avoid a vulnerability aiohttp>=3.9.0 # not directly required, pinned by Snyk to avoid a vulnerability sentry_sdk==1.39.1 - -beautifulsoup4==4.12.3 \ No newline at end of file +beautifulsoup4==4.12.3