Implement translations for data source seeder/ data catalogue route #496

Merged (8 commits) on Aug 5, 2024
1 change: 1 addition & 0 deletions global-api/alembic.ini
@@ -61,6 +61,7 @@ version_path_separator = os # Use os.pathsep. Default configuration used for ne
# output_encoding = utf-8

sqlalchemy.url = postgresql://ccglobal:@cc-global-api-db/ccglobal
#sqlalchemy.url = postgresql://ccglobal:@localhost/ccglobal

[post_write_hooks]
# post_write_hooks defines scripts or Python functions that are run
7 changes: 3 additions & 4 deletions global-api/import_argentiniandatasets.sh
@@ -60,8 +60,7 @@ popd
# Import datasources

pushd importer/datasource_seeder
psql -h $CC_GLOBAL_API_DB_HOST \
-U $CC_GLOBAL_API_DB_USER \
-d $CC_GLOBAL_API_DB_NAME \
-f ./import_datasource_seeder.sql

$python_cmd ./import_dataseeder.py --database_uri $DB_URI

popd
7 changes: 3 additions & 4 deletions global-api/import_everything.sh
@@ -126,10 +126,9 @@ popd
# Import datasources

pushd importer/datasource_seeder
psql -h $CC_GLOBAL_API_DB_HOST \
-U $CC_GLOBAL_API_DB_USER \
-d $CC_GLOBAL_API_DB_NAME \
-f ./import_datasource_seeder.sql

$python_cmd ./import_dataseeder.py --database_uri $DB_URI

popd


3 changes: 2 additions & 1 deletion global-api/importer/datasource_seeder/README.md
@@ -6,8 +6,9 @@ This is a catalogue of datasources that are available for use by CityCatalyst.
- `import_datasource_seeder.sql` imports the `datasource_seeder.csv` file into the database. It will update existing records and add new ones. You can run it like this:

```bash
psql -U ccglobal -d ccglobal -f import_datasource_seeder.sql
python3 import_dataseeder.py --database_uri postgresql://ccglobal:@localhost/ccglobal
```

## Datasource catalogue structure

- `datasource_id`: unique UUID for the datasource
global-api/importer/datasource_seeder/import_datasource_seeder.sql
@@ -9,7 +9,7 @@ TRUNCATE datasource_staging;

/* Load the staging table from the transformed file */

\copy datasource_staging (datasource_id,publisher_id,datasource_name,dataset_name,dataset_description,source_type,access_type,dataset_url,geographical_location,start_year,end_year,latest_accounting_year,frequency_of_update,spatial_resolution,language,accessibility,data_quality,notes,units,methodology_description,methodology_url,transformation_description,retrieval_method,api_endpoint,gpc_reference_number,scope) from 'datasource_seeder.csv' with CSV HEADER;
\copy datasource_staging (datasource_id,publisher_id,datasource_name,dataset_name,dataset_description,source_type,access_type,dataset_url,geographical_location,start_year,end_year,latest_accounting_year,frequency_of_update,spatial_resolution,language,accessibility,data_quality,notes,units,methodology_description,methodology_url,transformation_description,retrieval_method,api_endpoint,gpc_reference_number,scope) from 'datasource_seeder.yaml' with CSV HEADER;

/* Update the main table with the staging table */

1,855 changes: 1,855 additions & 0 deletions global-api/importer/datasource_seeder/datasource_seeder.yaml

Large diffs are not rendered by default.
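The seeder's contents are not rendered above, but given how `import_dataseeder.py` (below) loads and serializes it, each record presumably carries its translatable fields as per-language mappings. A minimal, purely illustrative sketch of one entry as a Python dict after `yaml.safe_load` (all values and language keys are hypothetical):

```python
# Illustrative only: one datasource_seeder.yaml entry as loaded into Python.
# Dict-valued fields (dataset_name, dataset_description, methodology_description,
# transformation_description) are the ones the importer serializes with json.dumps
# before they land in the new JSONB columns.
example_entry = {
    "datasource_id": "00000000-0000-0000-0000-000000000000",  # hypothetical UUID
    "publisher_id": "example-publisher",                       # hypothetical
    "datasource_name": "Example datasource",
    "dataset_name": {"en": "Example dataset", "es": "Conjunto de datos de ejemplo"},
    "dataset_description": {"en": "Example description", "es": "Descripción de ejemplo"},
    "methodology_description": {"en": "Example methodology", "es": "Metodología de ejemplo"},
    "transformation_description": {"en": "Example transformation", "es": "Transformación de ejemplo"},
    "gpc_reference_number": "I.1.1",
    "scope": "1",
}
```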

131 changes: 131 additions & 0 deletions global-api/importer/datasource_seeder/import_dataseeder.py
@@ -0,0 +1,131 @@
import yaml
import pandas as pd
from sqlalchemy import create_engine
import json
import argparse
import os
from sqlalchemy.sql import text

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--database_uri",
        help="database URI (e.g. postgresql://ccglobal:@localhost/ccglobal)",
        default=os.environ.get("DB_URI"),
    )
    args = parser.parse_args()

    # Load the YAML file into a Python dictionary
    with open('datasource_seeder.yaml', 'r') as file:
        data_dict = yaml.safe_load(file)

    # Convert the dictionary to a DataFrame
    df = pd.DataFrame.from_dict(data_dict)

    # If a column contains only dictionaries (translated fields), serialize them to JSON
    for column in df.columns:
        if df[column].apply(lambda x: isinstance(x, dict)).all():
            df[column] = df[column].apply(json.dumps)

    # Connect to PostgreSQL
    engine = create_engine(args.database_uri)

    # Insert the DataFrame into the staging table
    df.to_sql('datasource_staging', engine, if_exists='replace', index=False)

    # Upsert from the staging table into the main datasource table
    sql_query = """
INSERT INTO datasource (
datasource_id,
publisher_id,
datasource_name,
dataset_name,
dataset_description,
source_type,
access_type,
dataset_url,
geographical_location,
start_year,
end_year,
latest_accounting_year,
frequency_of_update,
spatial_resolution,
language,
accessibility,
data_quality,
notes,
units,
methodology_description,
methodology_url,
transformation_description,
retrieval_method,
api_endpoint,
gpc_reference_number,
scope
)
SELECT
datasource_id::uuid,
publisher_id,
datasource_name,
dataset_name::jsonb as dataset_name,
dataset_description::jsonb as dataset_description,
source_type,
access_type,
dataset_url,
geographical_location,
start_year,
end_year,
latest_accounting_year,
frequency_of_update,
spatial_resolution,
language,
accessibility,
data_quality,
notes,
units,
methodology_description::jsonb as methodology_description,
methodology_url,
transformation_description::jsonb as transformation_description,
retrieval_method,
api_endpoint,
gpc_reference_number,
scope
FROM datasource_staging
ON CONFLICT ON CONSTRAINT datasource_pkey
DO UPDATE SET
publisher_id = EXCLUDED.publisher_id,
datasource_name = EXCLUDED.datasource_name,
dataset_name = EXCLUDED.dataset_name,
dataset_description = EXCLUDED.dataset_description,
source_type = EXCLUDED.source_type,
access_type = EXCLUDED.access_type,
dataset_url = EXCLUDED.dataset_url,
geographical_location = EXCLUDED.geographical_location,
start_year = EXCLUDED.start_year,
end_year = EXCLUDED.end_year,
latest_accounting_year = EXCLUDED.latest_accounting_year,
frequency_of_update = EXCLUDED.frequency_of_update,
spatial_resolution = EXCLUDED.spatial_resolution,
language = EXCLUDED.language,
accessibility = EXCLUDED.accessibility,
data_quality = EXCLUDED.data_quality,
notes = EXCLUDED.notes,
units = EXCLUDED.units,
methodology_description = EXCLUDED.methodology_description,
methodology_url = EXCLUDED.methodology_url,
transformation_description = EXCLUDED.transformation_description,
retrieval_method = EXCLUDED.retrieval_method,
api_endpoint = EXCLUDED.api_endpoint,
gpc_reference_number = EXCLUDED.gpc_reference_number,
scope = EXCLUDED.scope,
modified_date = now();
"""

    with engine.connect() as connection:
        try:
            result = connection.execute(text(sql_query))
            connection.commit()
            print("Query completed successfully.")
        except Exception as e:
            print("Error updating datasource table:", e)
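Not part of the PR: a small verification sketch for after the import, assuming the local connection string from the README and that the seeder provides `en` and `es` keys for `dataset_name`:

```python
# Verification sketch: read a few translated names back from the datasource table.
# The connection string and language keys are assumptions; adjust as needed.
from sqlalchemy import create_engine, text

engine = create_engine("postgresql://ccglobal:@localhost/ccglobal")
with engine.connect() as conn:
    rows = conn.execute(text(
        "SELECT datasource_name, "
        "       dataset_name->>'en' AS name_en, "
        "       dataset_name->>'es' AS name_es "
        "FROM datasource LIMIT 5"
    ))
    for row in rows:
        print(row.datasource_name, "|", row.name_en, "|", row.name_es)
```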
@@ -0,0 +1,44 @@
"""allowing_other_languages

Revision ID: 77d1cb7b24df
Revises: 949c5b9cc18d
Create Date: 2024-05-01 13:06:46.227806

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB

# revision identifiers, used by Alembic.
revision: str = '77d1cb7b24df'
down_revision: Union[str, None] = '949c5b9cc18d'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None

def upgrade():
    # Drop the old TEXT columns
    op.drop_column('datasource', 'dataset_name')
    op.drop_column('datasource', 'dataset_description')
    op.drop_column('datasource', 'methodology_description')
    op.drop_column('datasource', 'transformation_description')

    # Add the new JSONB columns with the original column names
    op.add_column('datasource', sa.Column('dataset_name', JSONB, nullable=True))
    op.add_column('datasource', sa.Column('dataset_description', JSONB, nullable=True))
    op.add_column('datasource', sa.Column('methodology_description', JSONB, nullable=True))
    op.add_column('datasource', sa.Column('transformation_description', JSONB, nullable=True))

def downgrade():
    # Drop the new JSONB columns
    op.drop_column('datasource', 'dataset_name')
    op.drop_column('datasource', 'dataset_description')
    op.drop_column('datasource', 'methodology_description')
    op.drop_column('datasource', 'transformation_description')

    # Add back the old TEXT columns
    op.add_column('datasource', sa.Column('dataset_name', sa.TEXT(), nullable=True))
    op.add_column('datasource', sa.Column('dataset_description', sa.Text(), nullable=True))
    op.add_column('datasource', sa.Column('methodology_description', sa.TEXT(), nullable=True))
    op.add_column('datasource', sa.Column('transformation_description', sa.TEXT(), nullable=True))
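One design note on this migration: dropping the TEXT columns and re-adding them as JSONB discards any values already stored. Purely as a hedged alternative sketch (not what the PR does), the same conversion could be done in place, wrapping existing text under an `en` key:

```python
# Alternative sketch only: convert each TEXT column to JSONB in place so existing
# English values survive under an 'en' key. The helper name is hypothetical.
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects.postgresql import JSONB

def upgrade_in_place() -> None:
    for col in ('dataset_name', 'dataset_description',
                'methodology_description', 'transformation_description'):
        op.alter_column(
            'datasource', col,
            existing_type=sa.TEXT(),
            type_=JSONB,
            postgresql_using=f"jsonb_build_object('en', {col})",
        )
```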
@@ -0,0 +1,39 @@
"""fix_column_type_activity_emissions

Revision ID: c360f7e67f44
Revises: 949c5b9cc18d
Create Date: 2024-05-22 08:21:13.727742

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'c360f7e67f44'
down_revision: Union[str, None] = '949c5b9cc18d'
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    op.alter_column('country_code', 'activity_value',
                    existing_type=sa.String(),
                    type_=sa.Float(),
                    postgresql_using='activity_value::double precision')
    op.alter_column('country_code', 'emissions_value',
                    existing_type=sa.String(),
                    type_=sa.Float(),
                    postgresql_using='emissions_value::double precision')

def downgrade() -> None:
    op.alter_column('country_code', 'activity_value',
                    existing_type=sa.Float(),
                    type_=sa.String(),
                    postgresql_using='activity_value::text')
    op.alter_column('country_code', 'emissions_value',
                    existing_type=sa.Float(),
                    type_=sa.String(),
                    postgresql_using='emissions_value::text')
@@ -0,0 +1,26 @@
"""Merge heads from develop and languages

Revision ID: f8f5b2a87fff
Revises: c360f7e67f44, 77d1cb7b24df
Create Date: 2024-05-28 08:30:14.654581

"""
from typing import Sequence, Union

from alembic import op
import sqlalchemy as sa


# revision identifiers, used by Alembic.
revision: str = 'f8f5b2a87fff'
down_revision: Union[str, None] = ('c360f7e67f44', '77d1cb7b24df')
branch_labels: Union[str, Sequence[str], None] = None
depends_on: Union[str, Sequence[str], None] = None


def upgrade() -> None:
    pass


def downgrade() -> None:
    pass
61 changes: 60 additions & 1 deletion global-api/routes/catalogue_endpoint.py
@@ -3,17 +3,76 @@
from db.database import SessionLocal
from models.datasource import Datasource
from typing import Optional
from sqlalchemy import text
import csv
import io

api_router = APIRouter(prefix="/api/v0")


## to be deprecated ##
@api_router.get("/catalogue")
def get_datasources(format: Optional[str] = None):

    records = None

    with SessionLocal() as session:
        query = text("""
SELECT datasource_id,
publisher_id,
source_type,
dataset_url,
access_type,
geographical_location,
start_year,
end_year,
latest_accounting_year,
frequency_of_update,
spatial_resolution,
"language",
accessibility,
data_quality,
notes,
units,
methodology_url,
retrieval_method,
api_endpoint,
gpc_reference_number,
created_date,
modified_date,
datasource_name,
"scope",
dataset_name->>'en'::varchar as dataset_name,
dataset_description->>'en'::varchar as dataset_description,
methodology_description->>'en'::varchar as methodology_description,
transformation_description->>'en'::varchar as transformation_description
FROM public.datasource
ORDER BY gpc_reference_number DESC;
""")
        result = session.execute(query)
        records = result.mappings().all()

    if not records:
        raise HTTPException(status_code=404, detail="No data available")

    if format == "csv":
        output = io.StringIO()
        csvwriter = csv.writer(output)
        names = [column.name for column in Datasource.__table__.columns]
        csvwriter.writerow(names)
        for datasource in records:
            # rows are RowMapping objects, so index them by column name
            csvwriter.writerow([datasource[name] for name in names])
        response = PlainTextResponse(content=output.getvalue(), media_type="text/csv")
    else:
        response = {"datasources": records}

    return response


@api_router.get("/catalogue/i18n")
def get_datasources(format: Optional[str] = None):

    records = None

    with SessionLocal() as session:
        query = session.query(Datasource).order_by(Datasource.gpc_reference_number.desc())
        records = query.all()
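Illustrative only (not part of the diff): the practical difference between the two routes, assuming a local server and that `/catalogue/i18n` mirrors the legacy `{"datasources": [...]}` response shape; field values are hypothetical:

```python
# Sketch comparing the legacy (English-only) and i18n catalogue routes.
# Assumes the API is served at localhost:8000; adjust the base URL as needed.
import requests

legacy = requests.get("http://localhost:8000/api/v0/catalogue").json()
print(legacy["datasources"][0]["dataset_name"])
# e.g. "Example dataset"  (flattened to the 'en' translation by the SQL above)

i18n = requests.get("http://localhost:8000/api/v0/catalogue/i18n").json()
print(i18n["datasources"][0]["dataset_name"])
# e.g. {"en": "Example dataset", "es": "Conjunto de datos de ejemplo"}  (full translation object)
```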