-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #514 from Open-Earth-Foundation/ON-1661_Industrial…
…_Products_Statistics Industrial Products Statistics - Cleaned Raw Data
- Loading branch information
Showing
3 changed files
with
99 additions
and
0 deletions.
There are no files selected for viewing
24 changes: 24 additions & 0 deletions
24
global-api/importer/argentinian_datasets/indec_industrial_products/README.md
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
# Industrial Products Statistics - INDEC | ||
|
||
This program provides local production series for selected industrial goods, expressed in physical units. | ||
The information originates from different sources: INDEC's own surveys, data from other government agencies and information from business institutions. In the case of some products (wine, beer, soft drinks, cigarettes, cement, boats), to make up for the lack of production statistics or to complement them, figures for registrations, sales or shipments of national products are recorded. | ||
|
||
1. Extract the raw data from the source [INDEC](https://www.indec.gob.ar/indec/web/Nivel4-Tema-3-6-18): | ||
```bash | ||
python ./extraction.py | ||
``` | ||
2. Clean the raw data: | ||
```bash | ||
python ./clean_raw_data.py | ||
``` | ||
3. Load the cleaned raw data into a new table: | ||
[....] | ||
|
||
### Directory tree | ||
```sh | ||
. | ||
├── README.md # top level readme | ||
├── extraction.py # extraction script | ||
├── clean_raw_data.py # transformation script | ||
└── loading_raw_data.py # loading script | ||
``` |
46 changes: 46 additions & 0 deletions
46
global-api/importer/argentinian_datasets/indec_industrial_products/clean_raw_data.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,46 @@ | ||
import pandas as pd | ||
|
||
# Input/output locations. Both are relative to the current working directory,
# so the script is expected to be run from the folder that holds the CSV.
INPUT_CSV = "./industrial_products_statistics.csv"
OUTPUT_CSV = "./cleaned_industrial_products_statistics.csv"

# Unit labels that may appear in the overflow column 'Unnamed: 4'. A row
# carrying one of these labels there is treated as shifted one column to the
# right (see _realign_shifted_rows).
SHIFTED_UNITS = ("l", "square meters")

# Mapping from the unit names used in the raw file to standardized symbols.
UNIT_SYMBOLS = {"tonnes": "t", "cubic meters": "m3", "square meters": "m2"}


def _realign_shifted_rows(df: pd.DataFrame, unit: str) -> None:
    """Repair, in place, the rows whose 'Unnamed: 4' cell equals *unit*.

    For those rows the script treats 'activity_value' as the second half of
    the activity name and 'activity_units' as the actual value, so:

    * 'activity_name' becomes "<activity_name> - <activity_value>",
    * 'activity_value' takes the content of 'activity_units',
    * 'activity_units' is set to *unit*.

    NOTE(review): presumably the name was split across two columns upstream
    (e.g. by a delimiter inside the product name) -- confirm against the
    raw file.
    """
    shifted = df["Unnamed: 4"] == unit
    df.loc[shifted, "activity_name"] = (
        df.loc[shifted, "activity_name"] + " - " + df.loc[shifted, "activity_value"]
    )
    df.loc[shifted, "activity_value"] = df.loc[shifted, "activity_units"]
    df.loc[shifted, "activity_units"] = unit


def clean_industrial_products(df: pd.DataFrame) -> pd.DataFrame:
    """Return a cleaned copy of the raw industrial products table.

    The input must contain the columns 'Unnamed: 0', 'activity_name',
    'activity_value', 'activity_units' and 'Unnamed: 4'; a missing column
    raises ``KeyError``. The input frame itself is not modified.

    Steps:
      1. Realign the rows flagged by a unit label in 'Unnamed: 4'.
      2. Drop the 'Unnamed: 4' and 'Unnamed: 0' helper columns.
      3. Rename units to the standardized nomenclature.
      4. Add the 'GPC_refno', 'actor_id' and 'actor_name' columns.
    """
    df = df.copy()

    # Realignment must happen before the unit renaming below, because it
    # writes the long unit name ("square meters") into 'activity_units'.
    for unit in SHIFTED_UNITS:
        _realign_shifted_rows(df, unit)

    df = df.drop(columns=["Unnamed: 4"])

    # Rename units to standardized nomenclature
    df["activity_units"] = df["activity_units"].replace(UNIT_SYMBOLS)

    # Assign "GPC_refno" for the sub-sector, without the scope.
    # Note: the scope is not assigned because it depends on the city's
    # treatment process, e.g. whether the wastewater is treated inside or
    # outside the city.
    df["GPC_refno"] = "III.4"

    # Assign actor information
    df["actor_id"] = "AR"
    df["actor_name"] = "Argentina"

    # 'Unnamed: 0' is dropped last, as in the original script.
    df = df.drop(columns=["Unnamed: 0"])
    return df


def main() -> None:
    """Read the raw CSV, clean it and export the result as a CSV file."""
    raw = pd.read_csv(INPUT_CSV)
    cleaned = clean_industrial_products(raw)
    # NOTE(review): the row index is still written to the file (as before),
    # which shows up as an 'Unnamed: 0' column when the CSV is read back.
    # Pass index=False once it is confirmed that no downstream step relies
    # on that column.
    cleaned.to_csv(OUTPUT_CSV)


if __name__ == "__main__":
    main()
29 changes: 29 additions & 0 deletions
29
global-api/importer/argentinian_datasets/indec_industrial_products/extraction.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import requests | ||
import os | ||
|
||
# Workbook with the industrial products statistics published by INDEC.
link = 'https://www.indec.gob.ar/ftp/cuadros/economia/cuadros_epi_03_23.xls'

# Seconds to wait for the server before giving up on a request, so a stalled
# connection cannot hang the script indefinitely.
REQUEST_TIMEOUT = 60


def download_files(links, download_path):
    """Download one or more files into *download_path*.

    Args:
        links: A single URL string or an iterable of URL strings.
        download_path: Target directory; ``~`` is expanded and the directory
            is created if it does not exist.

    Returns:
        The list of file paths that were written successfully.

    Each file is saved as ``raw_<name>``, where ``<name>`` is the last path
    segment of its URL, so the original extension is kept. Failures (bad
    status code, network error, file-system error) are reported on stdout and
    do not stop the remaining downloads.
    """
    # Accept a bare string so existing calls that pass one URL keep working.
    if isinstance(links, str):
        links = [links]

    full_download_path = os.path.expanduser(download_path)
    os.makedirs(full_download_path, exist_ok=True)

    saved_paths = []
    for url in links:
        try:
            response = requests.get(url, timeout=REQUEST_TIMEOUT)
            # Check if the request was successful (status code 200)
            if response.status_code != 200:
                print(f"Failed to download {url} (Status code: {response.status_code})")
                continue

            # Name the file after the URL (query string stripped).
            base_name = url.split("?", 1)[0].rstrip("/").rsplit("/", 1)[-1]
            file_name = f"raw_{base_name}"
            # Construct the complete file path
            file_path = os.path.join(full_download_path, file_name)
            # Save the file
            with open(file_path, 'wb') as file:
                file.write(response.content)
            print(f"Downloaded: {file_name}")
            saved_paths.append(file_path)
        except (requests.RequestException, OSError) as e:
            print(f"Error downloading {url} data: {e}")

    return saved_paths


if __name__ == "__main__":
    # Use the specified download_path
    download_files(link, download_path='./')
|
||
|