Commit

Merge pull request #473 from Open-Earth-Foundation/ON-1656_CAMMESA_AR

CAMMESA_AR integration

amanda-eames authored Apr 30, 2024
2 parents f04500a + 154d826 commit aedd0fc
Showing 8 changed files with 423 additions and 115 deletions.
29 changes: 23 additions & 6 deletions global-api/import_argentiniandatasets.sh
@@ -7,12 +7,7 @@ else
fi

export PGPASSWORD=$CC_GLOBAL_API_DB_PASSWORD
export DB_URI="postgresql://$CC_GLOBAL_API_DB_USER:$CC_GLOBAL_API_DB_PASSWORD@$CC_GLOBAL_API_DB_HOST/$CC_GLOBAL_API_DB_NAME"

# removed by this commit (commented-out local-development defaults):
# export DB_URI="postgresql://ccglobal:@localhost/ccglobal"
# export CC_GLOBAL_API_DB_HOST="localhost"
# export CC_GLOBAL_API_DB_USER="ccglobal"
# export CC_GLOBAL_API_DB_NAME="ccglobal"

# Argentinian datasets
pushd importer/argentinian_datasets/BEN/
@@ -47,4 +42,26 @@ psql -h $CC_GLOBAL_API_DB_HOST \
-d $CC_GLOBAL_API_DB_NAME \
-f load_SESCO.sql

popd

# load cammesa

pushd importer/argentinian_datasets/cammesa/

$python_cmd ./transformation_cammesa.py --filepath ./ --database_uri $DB_URI

psql -h $CC_GLOBAL_API_DB_HOST \
-U $CC_GLOBAL_API_DB_USER \
-d $CC_GLOBAL_API_DB_NAME \
-f load_cammesa.sql

popd

# Import datasources

pushd importer/datasource_seeder
psql -h $CC_GLOBAL_API_DB_HOST \
-U $CC_GLOBAL_API_DB_USER \
-d $CC_GLOBAL_API_DB_NAME \
-f ./import_datasource_seeder.sql
popd
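
A hypothetical invocation of this script, with illustrative connection values (the variable names are the ones the script reads; the values here are placeholders, not part of the commit):

```bash
# illustrative local-development values — replace with real credentials
export CC_GLOBAL_API_DB_HOST="localhost"
export CC_GLOBAL_API_DB_USER="ccglobal"
export CC_GLOBAL_API_DB_NAME="ccglobal"
export CC_GLOBAL_API_DB_PASSWORD=""
bash ./import_argentiniandatasets.sh
```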
16 changes: 0 additions & 16 deletions global-api/importer/argentinian_datasets/cammesa/README

This file was deleted.

21 changes: 21 additions & 0 deletions global-api/importer/argentinian_datasets/cammesa/README.md
@@ -0,0 +1,21 @@
# cammesa - Argentina
Local data on energy generation by power plants in Argentina. This source is used to calculate GHG emissions for the Energy Industries subsector (GPC reference I.4.4) of the Stationary Energy sector.

1. Extract the activity data from the source [cammesa](https://cammesaweb.cammesa.com/download/factor-de-emision/)

2. Transform the activity data into emission records aligned with the Global API schema:
```bash
python ./importer/argentinian_datasets/cammesa/transformation_cammesa.py --filepath [path to the directory containing the raw data] --database_uri [database URI]
```
3. Load the transformed records into the database:
```bash
psql -U ccglobal -d ccglobal -f ./importer/argentinian_datasets/cammesa/load_cammesa.sql
```

### Directory tree
```sh
.
├── README.md # top level readme
├── transformation_cammesa.py # transformation script
└── load_cammesa.sql # loading script
```
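
For reference, the top-level `import_argentiniandatasets.sh` runs the same two steps; a condensed sketch of its cammesa block, assuming `DB_URI` and the `CC_GLOBAL_API_DB_*` variables are exported:

```bash
pushd importer/argentinian_datasets/cammesa/
python ./transformation_cammesa.py --filepath ./ --database_uri $DB_URI
psql -h $CC_GLOBAL_API_DB_HOST -U $CC_GLOBAL_API_DB_USER \
     -d $CC_GLOBAL_API_DB_NAME -f load_cammesa.sql
popd
```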
30 changes: 30 additions & 0 deletions global-api/importer/argentinian_datasets/cammesa/load_cammesa.sql
@@ -0,0 +1,30 @@
-- The id column is not unique across the processed records:
-- there are multiple activity records for a single (region_code, year, gas_name, GPC_refno).
-- Rather than upsert, we delete the existing source data and insert fresh rows with a
-- generated id so each record is unique. The regions route will need to aggregate over
-- (region_code, year, gas_name, GPC_refno) to get accurate emissions values.
DELETE FROM regionwide_emissions WHERE source_name = 'CAMMESA'; -- match the source_name written by transformation_cammesa.py

-- Update the main table with the staging table
INSERT INTO regionwide_emissions (
id,source_name,"GPC_refno",region_name,region_code,temporal_granularity,year,activity_name,activity_value,
activity_units,gas_name,emission_factor_value,emission_factor_units,emissions_value,emissions_units
)
SELECT gen_random_uuid() as id,
source_name,
"GPC_refno",
region_name,
region_code,
temporal_granularity,
year,
activity_name,
activity_value,
activity_units,
gas_name,
emission_factor_value,
emission_factor_units,
emissions_value,
emissions_units
FROM cammesa_region_emissions_staging;

-- Drop the staging table
DROP TABLE cammesa_region_emissions_staging;
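
The aggregation described in the header comment might look like this for the regions route; a sketch only, not part of this commit:

```sql
-- hypothetical query: sum emissions per region, year, gas and GPC reference
SELECT region_code, year, gas_name, "GPC_refno", emissions_units,
       SUM(emissions_value) AS emissions_value
FROM regionwide_emissions
WHERE source_name = 'CAMMESA'
GROUP BY region_code, year, gas_name, "GPC_refno", emissions_units;
```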

Large diffs are not rendered by default.

Binary file not shown.
@@ -2,33 +2,137 @@
import pandas as pd  # used by pd.read_excel below (sits above the displayed hunk in the full file)
import argparse
import uuid
import os
# >>> removed by this commit: the previous duckdb-based pre-processing >>>
import duckdb


#--------------------------------------------------------------------------
# Pre Process
#--------------------------------------------------------------------------

con = duckdb.connect()
con.install_extension("spatial")
con.load_extension("spatial")

df = con.execute("""SELECT Field1 AS Year,
                           Field2 AS Month,
                           Field3 AS Machine,
                           Field4 AS Center,
                           Field5 AS Agent,
                           Field6 AS Agent_Desc,
                           Field7 AS Region,
                           Field8 AS Provence,
                           Field9 AS Country,
                           Field10 AS Machine_Type,
                           Field11 AS Source_Generation,
                           Field12 AS Technology,
                           Field13 AS Hydraulic_Category,
                           Field14 AS Category_Region,
                           Field15 AS Net_Generation_MWh
                    FROM ST_read("raw_cammesa_monthly_electricity_generation.xlsx") WHERE Field1 IS NOT NULL OFFSET 13""").df()

# Close the connection
con.close()
# <<< end of removed block; replaced by the pandas/sqlalchemy pipeline below <<<
from sqlalchemy import create_engine

def uuid_generate_v3(name, namespace=uuid.NAMESPACE_OID):
    """generate a version 3 UUID from namespace and name"""
    assert isinstance(name, str), "name needs to be a string"
    assert isinstance(namespace, uuid.UUID), "namespace needs to be a uuid.UUID"
    return str(uuid.uuid3(namespace, name))
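# Example with made-up values: uuid_generate_v3("AR-B" + "2500.0" + "2023" + "CO2" + "I.4.4")
# uuid3 is deterministic, so re-running the import produces the same id for the same record.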

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--filepath", help="path to the files location", required=True)
    parser.add_argument(
        "--database_uri",
        help="database URI (e.g. postgresql://ccglobal:@localhost/ccglobal)",
        default=os.environ.get("DB_URI"),
    )
    args = parser.parse_args()
    absolute_path = os.path.abspath(args.filepath)

    # read the raw data
    raw_data = f'{absolute_path}/raw_cammesa_monthly_electricity_generation.xlsx'
    df = pd.read_excel(raw_data)

    #--------------------------------------------------------------------------
    # Pre-Process
    #--------------------------------------------------------------------------

    # assign column names (row 57 of the sheet holds the header row)
    df.columns = df.loc[57]

    # drop the rows above the data and reset the index
    df = df[58:]
    df = df.reset_index(drop=True)

    # select specific columns
    df = df[['AÑO','PROVINCIA','FUENTE GENERACION','COMBUSTIBLE','CONSUMO', 'Factor CO2 por Combustible','EMISIÓN [Ton CO2]']]

    # Calculate annual values
    df = df.groupby(['AÑO', 'PROVINCIA', 'FUENTE GENERACION', 'COMBUSTIBLE', 'Factor CO2 por Combustible'])[['CONSUMO', 'EMISIÓN [Ton CO2]']].sum().reset_index()

    # rename columns
    df = df.rename(columns={
        'AÑO': 'year',
        'PROVINCIA': 'region_name',
        'FUENTE GENERACION': 'source_generation',
        'COMBUSTIBLE': 'fuel',
        'Factor CO2 por Combustible': 'emission_factor_value',
        'CONSUMO': 'activity_value',
        'EMISIÓN [Ton CO2]': 'emissions_value'
    })

    # translate generation sources
    generation_source_dict = {
        'Renovable': 'renewable',
        'Térmica': 'thermal'
    }
    df['source_generation'] = df['source_generation'].replace(generation_source_dict)

    # translate fuel types
    fuel_dict = {
        'GAS NATURAL': 'natural gas',
        'CARBÓN MINERAL': 'mineral coal',
        'FUEL OIL': 'fuel oil',
        'GAS OIL': 'gas oil'
    }
    df['fuel'] = df['fuel'].replace(fuel_dict)

    # create an activity name column
    df['activity_name'] = df['fuel'] + ' combustion consumption for energy generation from ' + df['source_generation'] + ' plants'

    # convert tonnes to kg
    df['emissions_value'] *= 1000
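    # e.g. 2.5 tonnes of CO2 becomes 2500 kg (illustrative numbers)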

    # expand the abbreviated province name
    df['region_name'] = df['region_name'].replace('SGO.DEL ESTERO', 'SANTIAGO DEL ESTERO')

    # assign the ISO 3166-2:AR province code based on the province name
    region_code_dic = {
        'BUENOS AIRES':'AR-B',
        'CATAMARCA':'AR-K',
        'CHUBUT':'AR-U',
        'CORDOBA':'AR-X',
        'CORRIENTES':'AR-W',
        'ENTRE RIOS':'AR-E',
        'JUJUY':'AR-Y',
        'LA PAMPA':'AR-L',
        'LA RIOJA':'AR-F',
        'MENDOZA':'AR-M',
        'NEUQUEN':'AR-Q',
        'RIO NEGRO':'AR-R',
        'SALTA':'AR-A',
        'SAN JUAN':'AR-J',
        'SAN LUIS':'AR-D',
        'SANTA CRUZ':'AR-Z',
        'SANTA FE':'AR-S',
        'SANTIAGO DEL ESTERO':'AR-G',
        'TIERRA DEL FUEGO':'AR-V',
        'TUCUMAN':'AR-T',
        'MISIONES': 'AR-N',
        'FORMOSA': 'AR-P',
        'CHACO': 'AR-H'
    }
    df['region_code'] = df['region_name'].map(region_code_dic)

    df = df.drop(columns=['source_generation', 'fuel'])

    df.loc[:, 'emission_factor_units'] = 'tonne/tonne'
    df.loc[:, 'activity_units'] = 'tonne'
    df.loc[:, 'emissions_units'] = 'kg'
    df.loc[:, 'source_name'] = 'CAMMESA'
    df.loc[:, 'temporal_granularity'] = 'annual'
    df.loc[:, 'gas_name'] = 'CO2'
    df.loc[:, 'GPC_refno'] = 'I.4.4'


    # Define a function to generate a UUID for each row
    def generate_uuid(row):
        id_string = str(row['region_code']) + str(row['emissions_value']) + str(row['year']) + str(row['gas_name']) + str(row['GPC_refno'])
        return uuid_generate_v3(id_string)

    # Apply the function to each row and assign the result to a new column 'id'
    df['id'] = df.apply(generate_uuid, axis=1)

    col_order = ['id', 'source_name', 'GPC_refno', 'region_name', 'region_code', 'temporal_granularity', 'year', 'activity_name', 'activity_value',
                 'activity_units', 'gas_name', 'emission_factor_value', 'emission_factor_units', 'emissions_value', 'emissions_units']
    df = df.reindex(columns=col_order)

    #df.to_csv(f'{absolute_path}/processed_cammesa_AR.csv', sep=",", decimal=".", index=False)

    # Create a SQLAlchemy engine
    engine = create_engine(args.database_uri)

    # Write the DataFrame to the staging table, replacing any previous run
    df.to_sql('cammesa_region_emissions_staging', engine, if_exists='replace', index=False)
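
A quick sanity check after a run could query the staging table; a sketch assuming the table name above and a `DB_URI` environment variable:

```python
import os
from sqlalchemy import create_engine, text

# count the rows just written by transformation_cammesa.py (sketch; assumes DB_URI is set)
engine = create_engine(os.environ["DB_URI"])
with engine.connect() as conn:
    count = conn.execute(text("SELECT COUNT(*) FROM cammesa_region_emissions_staging")).scalar()
print(f"cammesa staging rows: {count}")
```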