From 1b9fa7de0d066f677c61b138908432afb33efacb Mon Sep 17 00:00:00 2001
From: mfonsecaOEF
Date: Thu, 6 Jun 2024 11:10:01 -0600
Subject: [PATCH 1/3] feat:download_raw_data

---
 .../indec_industrial_products/extraction.py | 44 +++++++++++++++++++
 1 file changed, 44 insertions(+)
 create mode 100644 global-api/importer/argentinian_datasets/indec_industrial_products/extraction.py

diff --git a/global-api/importer/argentinian_datasets/indec_industrial_products/extraction.py b/global-api/importer/argentinian_datasets/indec_industrial_products/extraction.py
new file mode 100644
index 000000000..3f95f8537
--- /dev/null
+++ b/global-api/importer/argentinian_datasets/indec_industrial_products/extraction.py
@@ -0,0 +1,44 @@
+import os
+from urllib.parse import urlparse
+
+import requests
+
+link = 'https://www.indec.gob.ar/ftp/cuadros/economia/cuadros_epi_03_23.xls'
+
+
+def download_files(links, download_path):
+    """Download every URL in ``links`` into ``download_path``.
+
+    ``links`` may be a single URL string or an iterable of URL strings.
+    Each file is saved as ``raw_<remote file name>``; failures are reported
+    on stdout and do not stop the remaining downloads.
+    """
+    if isinstance(links, str):
+        links = [links]
+
+    full_download_path = os.path.expanduser(download_path)
+    os.makedirs(full_download_path, exist_ok=True)
+
+    for url in links:
+        try:
+            response = requests.get(url, timeout=60)
+            # Check if the request was successful (status code 200)
+            if response.status_code == 200:
+                # Keep the remote file name so the extension matches the
+                # real format (.xls) and several links never collide
+                file_name = f"raw_{os.path.basename(urlparse(url).path)}"
+                # Construct the complete file path
+                file_path = os.path.join(full_download_path, file_name)
+                # Save the file
+                with open(file_path, 'wb') as file:
+                    file.write(response.content)
+                print(f"Downloaded: {file_name}")
+            else:
+                print(f"Failed to download {url} (Status code: {response.status_code})")
+        except (requests.RequestException, OSError) as e:
+            print(f"Error downloading {url} data: {e}")
+
+
+if __name__ == "__main__":
+    # Use the specified download_path
+    download_files(link, download_path='./')

From 72f6153ff484d64748c5cdd44fbee4371b11d1a9 Mon Sep 17 00:00:00 2001
From: mfonsecaOEF
Date: Thu, 6 Jun 2024 11:10:48 -0600
Subject: [PATCH 2/3] feat:cleaning_process

---
 .../clean_raw_data.py | 46 +++++++++++++++++++
 1 file changed, 46 insertions(+)
 create mode 100644 global-api/importer/argentinian_datasets/indec_industrial_products/clean_raw_data.py

diff --git a/global-api/importer/argentinian_datasets/indec_industrial_products/clean_raw_data.py b/global-api/importer/argentinian_datasets/indec_industrial_products/clean_raw_data.py
new file mode 100644
index 000000000..0b2c58e01
--- /dev/null
+++ b/global-api/importer/argentinian_datasets/indec_industrial_products/clean_raw_data.py
@@ -0,0 +1,46 @@
+"""Clean the INDEC industrial products statistics and export them as CSV."""
+
+import pandas as pd
+
+# import data
+df = pd.read_csv("./industrial_products_statistics.csv")
+
+# Some rows carry their unit in the 'Unnamed: 4' column. For those rows the
+# content of 'activity_value' is appended to the name and the content of
+# 'activity_units' becomes the value, for each unit listed below.
+for shifted_unit in ("l", "square meters"):
+    is_shifted = df["Unnamed: 4"] == shifted_unit
+
+    # Update 'activity_name', 'activity_value' and 'activity_units' columns
+    merged_name = (
+        df.loc[is_shifted, "activity_name"]
+        + " - "
+        + df.loc[is_shifted, "activity_value"]
+    )
+    df.loc[is_shifted, "activity_name"] = merged_name
+    df.loc[is_shifted, "activity_value"] = df.loc[is_shifted, "activity_units"]
+    df.loc[is_shifted, "activity_units"] = shifted_unit
+
+# Delete the 'Unnamed: 0' and 'Unnamed: 4' columns, which are not part of
+# the cleaned output
+df = df.drop(columns=["Unnamed: 0", "Unnamed: 4"])
+
+# Rename units to standardized nomenclature
+unit_symbols = {"tonnes": "t", "cubic meters": "m3", "square meters": "m2"}
+df["activity_units"] = df["activity_units"].replace(unit_symbols)
+
+# Assign "GPC_refno" for the sub-sector without scope
+# note: the scope is not assigned because it depends on the city treatment
+# process, e.g. whether the wastewater is treated inside or outside the city
+df["GPC_refno"] = "III.4"
+
+# Assign actor information
+df = df.assign(
+    actor_id="AR",
+    actor_name="Argentina",
+)
+
+# Export the df as csv file
+# NOTE: to_csv is called without index=False, so the DataFrame index is
+# written as an extra, unnamed first column of the output file
+df.to_csv("./cleaned_industrial_products_statistics.csv")

From a707db8c49f6110935bb7e303d7c508dbe1e3def Mon Sep 17
00:00:00 2001 From: mfonsecaOEF Date: Thu, 6 Jun 2024 11:11:14 -0600 Subject: [PATCH 3/3] doc:metadata --- .../indec_industrial_products/README.md | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 global-api/importer/argentinian_datasets/indec_industrial_products/README.md diff --git a/global-api/importer/argentinian_datasets/indec_industrial_products/README.md b/global-api/importer/argentinian_datasets/indec_industrial_products/README.md new file mode 100644 index 000000000..13c9349a6 --- /dev/null +++ b/global-api/importer/argentinian_datasets/indec_industrial_products/README.md @@ -0,0 +1,24 @@ +# Industrial Products Statistics - INDEC + +This program provides local production series of selected industrial goods, in physical units. +The information originates from different sources: INDEC's own surveys, data from other government agencies and information from business institutions. In the case of some products (wine, beer, soft drinks, cigarettes, cement, boats), to make up for the lack of production statistics or to complement them, figures for registrations, sales or shipments of national products are recorded. + +1. Extract the raw data from the source [INDEC](https://www.indec.gob.ar/indec/web/Nivel4-Tema-3-6-18): +```bash +python ./extraction.py +``` +2. Clean the raw data: +```bash +python ./clean_raw_data.py +``` +3. Load the cleaned data into a new table: +[....] + +### Directory tree +```sh +. +├── README.md # top level readme +├── extraction.py # extraction script +├── clean_raw_data.py # transformation script +└── loading_raw_data.py # loading script +``` \ No newline at end of file