Finishing touches on superset before user testing (#3888)
* Add dashboard creation automation, fix docker compose error, add logo, increase superset cloud run resources

* Add jinja extension to template
bendnorman authored and jdangerx committed Oct 4, 2024
1 parent 8ce4183 commit a3bc4fa
Showing 8 changed files with 554 additions and 3 deletions.
32 changes: 32 additions & 0 deletions superset/README.md
@@ -117,3 +117,35 @@ superset fab export-roles --path roles/roles.json
```

I've noticed that importing can sometimes take a few minutes.

## How to programmatically create PUDL table dashboards

We've designed a dashboard template for filtering and downloading PUDL tables. The `./automation/create_table_dashboards.py` script
programmatically creates one of these dashboards and all the charts it depends on.

To use this script, you'll first need to export the `superset-bot` credentials as environment variables:

```
export SUPERSET_USERNAME=superset-bot
export SUPERSET_PASSWORD={grab password from Google Secrets}
```

Then, to create a dashboard, run:

```
python ./automation/create_table_dashboards.py [TABLE_NAMES]...
```
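
For example, to create dashboards for two tables at once (the table names here are just illustrative; use any tables available in the PUDL database):

```
python ./automation/create_table_dashboards.py out_eia__yearly_generators core_eia860__scd_plants
```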

### Limitations / Open questions

Generally, the script is good enough for managing a few dashboards while we do user testing,
but it needs more work before we can use it in production:

- There is no "create dashboards for all tables" option. We could pull the list of tables from our metadata.
- The script creates the dataset, charts, and dashboard from scratch, so you'll have to delete everything and rerun it if you want to update a dashboard.
- It's still unclear how we'll want to programmatically update these charts. The best approach is probably to keep a canonical
  template dashboard that we edit in the UI, use the API to grab its configuration (see the sketch after this list),
  regenerate the `table_download_position.json` jinja template from it, and recreate all the dashboard elements.
- The dashboard template does not add table descriptions to the Data Dictionary tab of the dashboard, and it does not add filters.
- The script does not automatically publish the dashboard.
- For some reason, when the "Public" role has any permissions, the API authenticates as an anonymous user and throws an [error](https://github.com/apache/superset/discussions/18284) when sending POST requests.
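
To make the chart-update workflow above concrete, here's a minimal sketch (not part of this script) of pulling a dashboard's layout back out of the API so it could be regenerated as a jinja template. It assumes you run it from the `automation` directory with the same env vars set; the dashboard ID is hypothetical:

```python
import json

from create_table_dashboards import SupersetClient

client = SupersetClient()
dashboard_id = 42  # hypothetical: ID of the canonical dashboard edited in the UI

# GET /api/v1/dashboard/{id} returns the dashboard metadata, including the
# position_json string that encodes the chart layout.
r = client.session.get(f"{client.base_url}/api/v1/dashboard/{dashboard_id}")
r.raise_for_status()
position = json.loads(r.json()["result"]["position_json"])

# Pretty-print the layout as a starting point for regenerating
# templates/dashboards/table_download_position.json.
print(json.dumps(position, indent=2))
```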
264 changes: 264 additions & 0 deletions superset/automation/create_table_dashboards.py
@@ -0,0 +1,264 @@
"""Script for creating Superset dashboards of PUDL tables."""

import json
import logging
import os
from pathlib import Path

import click
import coloredlogs
import requests
from jinja2 import Template

logger = logging.getLogger(__name__)


class SupersetClient:
"""A client for interacting with the Superset API."""

def __init__(self, base_url: str = "https://superset.catalyst.coop") -> None:
"""Initialize the Superset client."""
self.base_url = base_url
self.session = self._init_session()

def _get_access_token(self, session: requests.Session) -> str:
"""Get the access token for the Superset API.
Note: I'm not sure how to authenticate using OAuth.
Args:
session: The requests session to use.
Returns:
The access token.
"""
payload = {
"username": os.environ["SUPERSET_USERNAME"],
"password": os.environ["SUPERSET_PASSWORD"],
"provider": "db",
"refresh": True,
}
        r = session.post(self.base_url + "/api/v1/security/login", json=payload)
        r.raise_for_status()
        return r.json()["access_token"]

def _init_session(self) -> requests.Session:
"""Initialize a requests session with the necessary headers."""
session = requests.Session()
session.headers["Authorization"] = "Bearer " + self._get_access_token(session)
session.headers["Content-Type"] = "application/json"

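        # State-changing requests need a CSRF token; the Referer header is set
        # because Superset (via Flask-WTF) also checks the referrer over HTTPS.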
csrf_url = f"{self.base_url}/api/v1/security/csrf_token/"
csrf_res = session.get(csrf_url)
csrf_token = csrf_res.json()["result"]

session.headers["Referer"] = csrf_url
session.headers["X-CSRFToken"] = csrf_token
return session

    def create_dataset(
        self, dataset_name: str, sql: str | None = None, database_id: int = 1
    ) -> int:
        """Create a new dataset in Superset.

        Args:
            dataset_name: The name of the dataset.
            sql: An optional SQL query that defines the dataset (creates a virtual dataset).
            database_id: The ID of the database to use. The current default is the PUDL database ID.

        Returns:
            The ID of the created dataset.
        """
        data = {
            "database": database_id,
            "schema": "pudl.main",
            "table_name": dataset_name,
        }
if sql:
data["sql"] = sql

r = self.session.post(self.base_url + "/api/v1/dataset/", json=data)

if r.status_code != 201:
raise ValueError(f"{r.status_code}: {r.content}")
return r.json()["id"]

def create_table_chart(self, dataset_id: int, table_name: str) -> int:
"""Create a table chart in Superset.
Args:
dataset_id: The ID of the dataset to create the chart for.
table_name: The name of the table to create the chart for.
Returns:
The ID of the created chart.
"""
file_path = Path(__file__).parent / "templates/charts/table_download.json"

# Open the file and load the data
with file_path.open() as file:
params = json.load(file)

# Get all the columns from the dataset because there is no "select all" option
r = self.session.get(self.base_url + f"/api/v1/dataset/{dataset_id}")
columns = [col["column_name"] for col in r.json()["result"]["columns"]]
params["all_columns"] = columns

data = {
"datasource_id": dataset_id,
"slice_name": table_name,
"datasource_type": "table",
"viz_type": "table",
"params": json.dumps(params),
}

r = self.session.post(self.base_url + "/api/v1/chart/", json=data)
if r.status_code != 201:
raise ValueError(f"{r.status_code}: {r.content}")
return r.json()["id"]

def create_row_count_chart(self, dataset_id: int, table_name: str) -> int:
"""Create a table row count chart in Superset.
Args:
dataset_id: The ID of the dataset to create the chart for.
table_name: The name of the table to create the chart for.
Returns:
The ID of the created chart.
"""
file_path = Path(__file__).parent / "templates/charts/row_count.json"

# Open the file and load the data
with file_path.open() as file:
params = json.load(file)

data = {
"datasource_id": dataset_id,
"slice_name": f"{table_name} Row Count",
"datasource_type": "table",
"viz_type": "big_number_total",
"params": json.dumps(params),
}

r = self.session.post(self.base_url + "/api/v1/chart/", json=data)
if r.status_code != 201:
raise ValueError(f"{r.status_code}: {r.content}")
return r.json()["id"]

def create_dashboard(
self,
table_name: str,
table_dataset_id: int,
table_chart_id: int,
data_dictionary_chart_id: int,
row_count_chart_id: int,
):
"""Create a the PUDL table dashboard.
Args:
table_name: The name of the table to create the dashboard for.
table_dataset_id: The ID of the dataset for the table.
table_chart_id: The ID of the chart for the table.
data_dictionary_chart_id: The ID of the chart for the data dictionary.
row_count_chart_id: The ID of the chart for the row
"""
# Load JSON template from file
file_path = (
Path(__file__).parent / "templates/dashboards/table_download_position.json"
)
with file_path.open() as file:
json_template_str = file.read()

# Create a jinja2 template object
template = Template(json_template_str)

# Define the values to be substituted
data = {
"table_chart_id": table_chart_id,
"data_dictionary_chart_id": data_dictionary_chart_id,
"row_count_chart_id": row_count_chart_id,
"table_name": table_name,
"table_dataset_id": table_dataset_id,
}

# Render the template with the actual values
rendered_json_str = template.render(data)

# create the dashboard
data = {
"dashboard_title": table_name,
"position_json": rendered_json_str,
"slug": table_name,
}
r = self.session.post(self.base_url + "/api/v1/dashboard/", json=data)
if r.status_code != 201:
raise ValueError(f"{r.status_code}: {r.content}")

dash_id = r.json()["id"]

        # Add the dashboard ID to each chart. If you don't do this, the
        # dashboard layout will be correct but Superset won't be able to find
        # the charts.
data = {"dashboards": [dash_id]}
for chart_id in (table_chart_id, data_dictionary_chart_id, row_count_chart_id):
r = self.session.put(self.base_url + f"/api/v1/chart/{chart_id}", json=data)
if r.status_code != 200:
raise ValueError(f"{r.status_code}: {r.content}")

def create_all_table_dashboard_assets(self, table_name: str):
"""Create all the assets needed for a PUDL table dashboard.
Args:
table_name: The name of the table to create the dashboard for.
"""
table_dataset_id = self.create_dataset(table_name)

# create the data dictionary dataset
sql = f"SELECT column_name, data_type, comment AS description FROM duckdb_columns() where table_name = '{table_name}';" # noqa: S608
data_dict_dataset_id = self.create_dataset(
table_name + " Column Descriptions", sql=sql
)

# create the table chart
table_chart_id = self.create_table_chart(table_dataset_id, table_name)
# create the data dictionary table chart
data_dictionary_chart_id = self.create_table_chart(
data_dict_dataset_id, f"{table_name} Column Descriptions"
) # TODO: add searchable option to the table!
row_count_chart_id = self.create_row_count_chart(table_dataset_id, table_name)

# create the dashboard
self.create_dashboard(
table_name,
table_dataset_id,
table_chart_id,
data_dictionary_chart_id,
row_count_chart_id,
)


@click.command()
@click.argument("table_names", nargs=-1)
@click.option(
"--loglevel",
help="Set logging level (DEBUG, INFO, WARNING, ERROR, or CRITICAL).",
default="INFO",
)
def create_table_dashboards(loglevel: str, table_names: tuple[str, ...]):
    """Create a Superset dashboard for each of the given PUDL table names."""
superset_logger = logging.getLogger()
log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
coloredlogs.install(fmt=log_format, level=loglevel, logger=superset_logger)

if not table_names:
        raise click.UsageError("At least one table name is required.")

client = SupersetClient()
for table_name in table_names:
logger.info(f"Creating dashboard for {table_name}")
client.create_all_table_dashboard_assets(table_name)
logger.info(f"Dashboard for {table_name} created successfully.")


if __name__ == "__main__":
create_table_dashboards()
36 changes: 36 additions & 0 deletions superset/automation/templates/charts/row_count.json
@@ -0,0 +1,36 @@
{
"viz_type": "big_number_total",
"metric": {
"aggregate": null,
"column": null,
"datasourceWarning": false,
"expressionType": "SQL",
"hasCustomLabel": false,
"label": "COUNT(*)",
"optionName": "metric_ds9fqpcx819_axxvkrtyf0c",
"sqlExpression": "COUNT(*)"
},
"adhoc_filters": [
{
"clause": "WHERE",
"comparator": "No filter",
"expressionType": "SIMPLE",
"operator": "TEMPORAL_RANGE",
"subject": "report_date"
}
],
"subheader": "Warning! You are limited to viewing and downloading 100,000 rows. You can apply filters to grab the data you need.",
"header_font_size": 0.6,
"subheader_font_size": 0.15,
"y_axis_format": "SMART_NUMBER",
"time_format": "smart_date",
"conditional_formatting": [
{
"colorScheme": "#A7323F",
"column": "COUNT(*)",
"operator": ">",
"targetValue": 100000
}
],
"extra_form_data": {}
}
26 changes: 26 additions & 0 deletions superset/automation/templates/charts/table_download.json
@@ -0,0 +1,26 @@
{
"viz_type": "table",
"query_mode": "raw",
"groupby": [],
"temporal_columns_lookup": {},
"all_columns": [],
"percent_metrics": [],
"adhoc_filters": [
{
"clause": "WHERE",
"comparator": "No filter",
"expressionType": "SIMPLE",
"operator": "TEMPORAL_RANGE",
"subject": "report_date"
}
],
"order_by_cols": [],
"server_pagination": false,
"row_limit": "100000",
"table_timestamp_format": "smart_date",
"show_cell_bars": true,
"color_pn": true,
"conditional_formatting": [],
"extra_form_data": {},
"include_search": true
}