Commit

Merge pull request #473 from Open-Earth-Foundation/ON-1656_CAMMESA_AR

CAMMESA_AR integration

amanda-eames authored Apr 30, 2024
2 parents f04500a + 154d826 commit aedd0fc
Showing 8 changed files with 423 additions and 115 deletions.
29 changes: 23 additions & 6 deletions global-api/import_argentiniandatasets.sh
@@ -7,12 +7,7 @@ else
fi

export PGPASSWORD=$CC_GLOBAL_API_DB_PASSWORD
export DB_URI="postgresql://$CC_GLOBAL_API_DB_USER:$CC_GLOBAL_API_DB_PASSWORD@$CC_GLOBAL_API_DB_HOST/$CC_GLOBAL_API_DB_NAME"

# removed by this commit (commented-out local-development defaults):
# export DB_URI="postgresql://ccglobal:@localhost/ccglobal"
# export CC_GLOBAL_API_DB_HOST="localhost"
# export CC_GLOBAL_API_DB_USER="ccglobal"
# export CC_GLOBAL_API_DB_NAME="ccglobal"

# Argentinian datasets
pushd importer/argentinian_datasets/BEN/
@@ -47,4 +42,26 @@ psql -h $CC_GLOBAL_API_DB_HOST \
-d $CC_GLOBAL_API_DB_NAME \
-f load_SESCO.sql

popd

# load cammesa

pushd importer/argentinian_datasets/cammesa/

$python_cmd ./transformation_cammesa.py --filepath ./ --database_uri $DB_URI

psql -h $CC_GLOBAL_API_DB_HOST \
-U $CC_GLOBAL_API_DB_USER \
-d $CC_GLOBAL_API_DB_NAME \
-f load_cammesa.sql

popd

# Import datasources

pushd importer/datasource_seeder
psql -h $CC_GLOBAL_API_DB_HOST \
-U $CC_GLOBAL_API_DB_USER \
-d $CC_GLOBAL_API_DB_NAME \
-f ./import_datasource_seeder.sql
popd
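
A hypothetical invocation of this script, with illustrative connection values (the variable names are the ones the script reads; the values here are placeholders, not part of the commit):

```bash
# illustrative local-development values — replace with real credentials
export CC_GLOBAL_API_DB_HOST="localhost"
export CC_GLOBAL_API_DB_USER="ccglobal"
export CC_GLOBAL_API_DB_NAME="ccglobal"
export CC_GLOBAL_API_DB_PASSWORD=""
bash ./import_argentiniandatasets.sh
```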
16 changes: 0 additions & 16 deletions global-api/importer/argentinian_datasets/cammesa/README

This file was deleted.

21 changes: 21 additions & 0 deletions global-api/importer/argentinian_datasets/cammesa/README.md
@@ -0,0 +1,21 @@
# cammesa - Argentina
Local data on energy generation by power plants in Argentina. This source is used to calculate GHG emissions for the Energy Industries subsector (GPC reference I.4.4) of the Stationary Energy sector.

1. Extract the activity data from the source [cammesa](https://cammesaweb.cammesa.com/download/factor-de-emision/)

2. Transform the activity data into emission records aligned with the Global API schema:
```bash
python ./importer/argentinian_datasets/cammesa/transformation_cammesa.py --filepath [path to the directory containing the raw data] --database_uri [database URI]
```
3. Load the transformed records into the database:
```bash
psql -U ccglobal -d ccglobal -f ./importer/argentinian_datasets/cammesa/load_cammesa.sql
```

### Directory tree
```sh
.
├── README.md # top level readme
├── transformation_cammesa.py # transformation script
└── load_cammesa.sql # loading script
```
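
For reference, the top-level `import_argentiniandatasets.sh` runs the same two steps; a condensed sketch of its cammesa block, assuming `DB_URI` and the `CC_GLOBAL_API_DB_*` variables are exported:

```bash
pushd importer/argentinian_datasets/cammesa/
python ./transformation_cammesa.py --filepath ./ --database_uri $DB_URI
psql -h $CC_GLOBAL_API_DB_HOST -U $CC_GLOBAL_API_DB_USER \
     -d $CC_GLOBAL_API_DB_NAME -f load_cammesa.sql
popd
```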
30 changes: 30 additions & 0 deletions global-api/importer/argentinian_datasets/cammesa/load_cammesa.sql
@@ -0,0 +1,30 @@
-- The id column is not unique across the processed records:
-- there are multiple activity records for a single (region_code, year, gas_name, GPC_refno).
-- Rather than upsert, we delete the existing source data and insert fresh rows with a
-- generated id so each record is unique. The regions route will need to aggregate over
-- (region_code, year, gas_name, GPC_refno) to get accurate emissions values.
DELETE FROM regionwide_emissions WHERE source_name = 'CAMMESA'; -- match the source_name written by transformation_cammesa.py

-- Update the main table with the staging table
INSERT INTO regionwide_emissions (
id,source_name,"GPC_refno",region_name,region_code,temporal_granularity,year,activity_name,activity_value,
activity_units,gas_name,emission_factor_value,emission_factor_units,emissions_value,emissions_units
)
SELECT gen_random_uuid() as id,
source_name,
"GPC_refno",
region_name,
region_code,
temporal_granularity,
year,
activity_name,
activity_value,
activity_units,
gas_name,
emission_factor_value,
emission_factor_units,
emissions_value,
emissions_units
FROM cammesa_region_emissions_staging;

-- Drop the staging table
DROP TABLE cammesa_region_emissions_staging;
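
The aggregation described in the header comment might look like this for the regions route; a sketch only, not part of this commit:

```sql
-- hypothetical query: sum emissions per region, year, gas and GPC reference
SELECT region_code, year, gas_name, "GPC_refno", emissions_units,
       SUM(emissions_value) AS emissions_value
FROM regionwide_emissions
WHERE source_name = 'CAMMESA'
GROUP BY region_code, year, gas_name, "GPC_refno", emissions_units;
```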

Large diffs are not rendered by default.

Binary file not shown.
@@ -2,33 +2,137 @@
import pandas as pd  # used by pd.read_excel below (sits above the displayed hunk in the full file)
import argparse
import uuid
import os
# >>> removed by this commit: the previous duckdb-based pre-processing >>>
import duckdb


#--------------------------------------------------------------------------
# Pre Process
#--------------------------------------------------------------------------

con = duckdb.connect()
con.install_extension("spatial")
con.load_extension("spatial")

df = con.execute("""SELECT Field1 AS Year,
                           Field2 AS Month,
                           Field3 AS Machine,
                           Field4 AS Center,
                           Field5 AS Agent,
                           Field6 AS Agent_Desc,
                           Field7 AS Region,
                           Field8 AS Provence,
                           Field9 AS Country,
                           Field10 AS Machine_Type,
                           Field11 AS Source_Generation,
                           Field12 AS Technology,
                           Field13 AS Hydraulic_Category,
                           Field14 AS Category_Region,
                           Field15 AS Net_Generation_MWh
                    FROM ST_read("raw_cammesa_monthly_electricity_generation.xlsx") WHERE Field1 IS NOT NULL OFFSET 13""").df()

# Close the connection
con.close()
# <<< end of removed block; replaced by the pandas/sqlalchemy pipeline below <<<
from sqlalchemy import create_engine

def uuid_generate_v3(name, namespace=uuid.NAMESPACE_OID):
    """generate a version 3 UUID from namespace and name"""
    assert isinstance(name, str), "name needs to be a string"
    assert isinstance(namespace, uuid.UUID), "namespace needs to be a uuid.UUID"
    return str(uuid.uuid3(namespace, name))
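# Example with made-up values: uuid_generate_v3("AR-B" + "2500.0" + "2023" + "CO2" + "I.4.4")
# uuid3 is deterministic, so re-running the import produces the same id for the same record.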

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--filepath", help="path to the files location", required=True)
    parser.add_argument(
        "--database_uri",
        help="database URI (e.g. postgresql://ccglobal:@localhost/ccglobal)",
        default=os.environ.get("DB_URI"),
    )
    args = parser.parse_args()
    absolute_path = os.path.abspath(args.filepath)

    # read the raw data
    raw_data = f'{absolute_path}/raw_cammesa_monthly_electricity_generation.xlsx'
    df = pd.read_excel(raw_data)

    #--------------------------------------------------------------------------
    # Pre-Process
    #--------------------------------------------------------------------------

    # assign column names (row 57 of the sheet holds the header row)
    df.columns = df.loc[57]

    # drop the rows above the data and reset the index
    df = df[58:]
    df = df.reset_index(drop=True)

    # select specific columns
    df = df[['AÑO','PROVINCIA','FUENTE GENERACION','COMBUSTIBLE','CONSUMO', 'Factor CO2 por Combustible','EMISIÓN [Ton CO2]']]

    # Calculate annual values
    df = df.groupby(['AÑO', 'PROVINCIA', 'FUENTE GENERACION', 'COMBUSTIBLE', 'Factor CO2 por Combustible'])[['CONSUMO', 'EMISIÓN [Ton CO2]']].sum().reset_index()

    # rename columns
    df = df.rename(columns={
        'AÑO': 'year',
        'PROVINCIA': 'region_name',
        'FUENTE GENERACION': 'source_generation',
        'COMBUSTIBLE': 'fuel',
        'Factor CO2 por Combustible': 'emission_factor_value',
        'CONSUMO': 'activity_value',
        'EMISIÓN [Ton CO2]': 'emissions_value'
    })

    # translate generation sources
    generation_source_dict = {
        'Renovable': 'renewable',
        'Térmica': 'thermal'
    }
    df['source_generation'] = df['source_generation'].replace(generation_source_dict)

    # translate fuel types
    fuel_dict = {
        'GAS NATURAL': 'natural gas',
        'CARBÓN MINERAL': 'mineral coal',
        'FUEL OIL': 'fuel oil',
        'GAS OIL': 'gas oil'
    }
    df['fuel'] = df['fuel'].replace(fuel_dict)

    # create an activity name column
    df['activity_name'] = df['fuel'] + ' combustion consumption for energy generation from ' + df['source_generation'] + ' plants'

    # convert tonnes to kg
    df['emissions_value'] *= 1000
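    # e.g. 2.5 tonnes of CO2 becomes 2500 kg (illustrative numbers)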

    # expand the abbreviated province name
    df['region_name'] = df['region_name'].replace('SGO.DEL ESTERO', 'SANTIAGO DEL ESTERO')

    # assign the ISO 3166-2:AR province code based on the province name
    region_code_dic = {
        'BUENOS AIRES':'AR-B',
        'CATAMARCA':'AR-K',
        'CHUBUT':'AR-U',
        'CORDOBA':'AR-X',
        'CORRIENTES':'AR-W',
        'ENTRE RIOS':'AR-E',
        'JUJUY':'AR-Y',
        'LA PAMPA':'AR-L',
        'LA RIOJA':'AR-F',
        'MENDOZA':'AR-M',
        'NEUQUEN':'AR-Q',
        'RIO NEGRO':'AR-R',
        'SALTA':'AR-A',
        'SAN JUAN':'AR-J',
        'SAN LUIS':'AR-D',
        'SANTA CRUZ':'AR-Z',
        'SANTA FE':'AR-S',
        'SANTIAGO DEL ESTERO':'AR-G',
        'TIERRA DEL FUEGO':'AR-V',
        'TUCUMAN':'AR-T',
        'MISIONES': 'AR-N',
        'FORMOSA': 'AR-P',
        'CHACO': 'AR-H'
    }
    df['region_code'] = df['region_name'].map(region_code_dic)

    df = df.drop(columns=['source_generation', 'fuel'])

    df.loc[:, 'emission_factor_units'] = 'tonne/tonne'
    df.loc[:, 'activity_units'] = 'tonne'
    df.loc[:, 'emissions_units'] = 'kg'
    df.loc[:, 'source_name'] = 'CAMMESA'
    df.loc[:, 'temporal_granularity'] = 'annual'
    df.loc[:, 'gas_name'] = 'CO2'
    df.loc[:, 'GPC_refno'] = 'I.4.4'


    # Define a function to generate a UUID for each row
    def generate_uuid(row):
        id_string = str(row['region_code']) + str(row['emissions_value']) + str(row['year']) + str(row['gas_name']) + str(row['GPC_refno'])
        return uuid_generate_v3(id_string)

    # Apply the function to each row and assign the result to a new column 'id'
    df['id'] = df.apply(generate_uuid, axis=1)

    col_order = ['id', 'source_name', 'GPC_refno', 'region_name', 'region_code', 'temporal_granularity', 'year', 'activity_name', 'activity_value',
                 'activity_units', 'gas_name', 'emission_factor_value', 'emission_factor_units', 'emissions_value', 'emissions_units']
    df = df.reindex(columns=col_order)

    #df.to_csv(f'{absolute_path}/processed_cammesa_AR.csv', sep=",", decimal=".", index=False)

    # Create a SQLAlchemy engine
    engine = create_engine(args.database_uri)

    # Write the DataFrame to the staging table, replacing any previous run
    df.to_sql('cammesa_region_emissions_staging', engine, if_exists='replace', index=False)
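
A quick sanity check after a run could query the staging table; a sketch assuming the table name above and a `DB_URI` environment variable:

```python
import os
from sqlalchemy import create_engine, text

# count the rows just written by transformation_cammesa.py (sketch; assumes DB_URI is set)
engine = create_engine(os.environ["DB_URI"])
with engine.connect() as conn:
    count = conn.execute(text("SELECT COUNT(*) FROM cammesa_region_emissions_staging")).scalar()
print(f"cammesa staging rows: {count}")
```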