Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Finishing touches on superset before user testing #3888

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 32 additions & 0 deletions superset/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -117,3 +117,35 @@ superset fab export-roles --path roles/roles.json
```

I've noticed the importing can sometimes take a few minutes.

## How to programmatically create PUDL table dashboards

We've designed a dashboard template for filtering and downloading PUDL tables. The `./automation/create_table_dashboards.py` script
programmatically creates one of these dashboards and all the charts it depends on.

To use this script you'll first need to assign the `superset-bot` credentials to env vars:

```
export SUPERSET_USERNAME=superset-bot
export SUPERSET_PASSWORD={grab password from Google Secrets}
```

Then, to create a dashboard, run:

```
python ./automation/create_table_dashboards.py [TABLE_NAMES]...
```

### Limitations / Open questions
Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is deserving of a more thorough write-up, which we can do later if we want to move Superset into production.


Generally the script is good enough for managing a few dashboards while we do user testing.
It needs a lot of work if we want to use it in production:

- There is no "create all the tables" option. We could import this information from our metadata
- This script creates the dataset, chart and dashboard from scratch so you'll have to delete everything if you want to update a dashboard.
- It's still unclear how we'll want to programmatically update these charts. I think the best way to do it is to have a canonical
template chart that we edit in the UI, use the API to grab the configuration of the dashboard, recreate the `table_download_position.json` jinja template
and recreate all the dashboard elements.
- The dashboard template does not add table descriptions to the Data Dictionary tab of the dashboard and it does not add filters
- The script does not automatically publish the dashboard
- For some reason, when the "Public" role has any permissions, the API authenticates as an anonymous user and throws an [error](https://github.com/apache/superset/discussions/18284) when sending POST requests.
264 changes: 264 additions & 0 deletions superset/automation/create_table_dashboards.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,264 @@
"""Script for creating Superset dashboards of PUDL tables."""

import json
import logging
import os
from pathlib import Path

import click
import coloredlogs
import requests
from jinja2 import Template

logger = logging.getLogger(__name__)


class SupersetClient:
    """A client for interacting with the Superset REST API.

    Authenticates with the ``SUPERSET_USERNAME`` / ``SUPERSET_PASSWORD``
    environment variables and maintains a :class:`requests.Session` carrying
    the JWT access token and CSRF headers required for write operations.
    """

    def __init__(self, base_url: str = "https://superset.catalyst.coop") -> None:
        """Initialize the Superset client.

        Args:
            base_url: Root URL of the Superset instance (no trailing slash).
        """
        self.base_url = base_url
        self.session = self._init_session()

    def _get_access_token(self, session: requests.Session) -> str:
        """Get the access token for the Superset API.

        Note: I'm not sure how to authenticate using OAuth.

        Args:
            session: The requests session to use.

        Returns:
            The access token.

        Raises:
            requests.HTTPError: If the login request is rejected (e.g. bad
                credentials) — fail fast with a clear HTTP error instead of a
                cryptic ``KeyError`` on the missing ``access_token`` field.
        """
        payload = {
            "username": os.environ["SUPERSET_USERNAME"],
            "password": os.environ["SUPERSET_PASSWORD"],
            "provider": "db",
            "refresh": True,
        }
        r = session.post(self.base_url + "/api/v1/security/login", json=payload)
        r.raise_for_status()
        return r.json()["access_token"]

    def _init_session(self) -> requests.Session:
        """Initialize a requests session with auth and CSRF headers.

        Returns:
            A session whose headers carry the bearer token, a JSON content
            type, and the CSRF token/referer Superset requires on mutations.
        """
        session = requests.Session()
        session.headers["Authorization"] = "Bearer " + self._get_access_token(session)
        session.headers["Content-Type"] = "application/json"

        csrf_url = f"{self.base_url}/api/v1/security/csrf_token/"
        csrf_res = session.get(csrf_url)
        csrf_res.raise_for_status()
        csrf_token = csrf_res.json()["result"]

        # Superset validates the Referer header alongside the CSRF token.
        session.headers["Referer"] = csrf_url
        session.headers["X-CSRFToken"] = csrf_token
        return session

    def _post_json(self, endpoint: str, data: dict) -> dict:
        """POST ``data`` to an API endpoint and return the parsed JSON body.

        Args:
            endpoint: API path beginning with ``/`` (e.g. ``/api/v1/chart/``).
            data: JSON-serializable request payload.

        Returns:
            The decoded JSON response.

        Raises:
            ValueError: If the resource was not created (HTTP status != 201).
        """
        r = self.session.post(self.base_url + endpoint, json=data)
        if r.status_code != 201:
            raise ValueError(f"{r.status_code}: {r.content}")
        return r.json()

    def create_dataset(self, dataset_name: str, sql: str | None = None, database_id: int = 1) -> int:
        """Create a new dataset in Superset.

        Args:
            dataset_name: The name of the dataset.
            sql: The SQL query to generate the dataset. If omitted, the
                dataset is a plain physical table.
            database_id: The ID of the database to use. The current default is
                the PUDL database id.

        Returns:
            The ID of the created dataset.

        Raises:
            ValueError: If the API does not return HTTP 201.
        """
        data = {
            "database": database_id,
            "schema": "pudl.main",
            "table_name": dataset_name,
        }
        if sql:
            data["sql"] = sql

        return self._post_json("/api/v1/dataset/", data)["id"]

    def create_table_chart(self, dataset_id: int, table_name: str) -> int:
        """Create a table chart in Superset.

        Args:
            dataset_id: The ID of the dataset to create the chart for.
            table_name: The name of the table to create the chart for.

        Returns:
            The ID of the created chart.

        Raises:
            ValueError: If the chart-creation request does not return HTTP 201.
            requests.HTTPError: If the dataset-columns lookup fails.
        """
        file_path = Path(__file__).parent / "templates/charts/table_download.json"
        with file_path.open() as file:
            params = json.load(file)

        # Get all the columns from the dataset because the chart config has
        # no "select all columns" option.
        r = self.session.get(self.base_url + f"/api/v1/dataset/{dataset_id}")
        r.raise_for_status()
        params["all_columns"] = [
            col["column_name"] for col in r.json()["result"]["columns"]
        ]

        data = {
            "datasource_id": dataset_id,
            "slice_name": table_name,
            "datasource_type": "table",
            "viz_type": "table",
            "params": json.dumps(params),
        }
        return self._post_json("/api/v1/chart/", data)["id"]

    def create_row_count_chart(self, dataset_id: int, table_name: str) -> int:
        """Create a table row count chart in Superset.

        Args:
            dataset_id: The ID of the dataset to create the chart for.
            table_name: The name of the table to create the chart for.

        Returns:
            The ID of the created chart.

        Raises:
            ValueError: If the API does not return HTTP 201.
        """
        file_path = Path(__file__).parent / "templates/charts/row_count.json"
        with file_path.open() as file:
            params = json.load(file)

        data = {
            "datasource_id": dataset_id,
            "slice_name": f"{table_name} Row Count",
            "datasource_type": "table",
            "viz_type": "big_number_total",
            "params": json.dumps(params),
        }
        return self._post_json("/api/v1/chart/", data)["id"]

    def create_dashboard(
        self,
        table_name: str,
        table_dataset_id: int,
        table_chart_id: int,
        data_dictionary_chart_id: int,
        row_count_chart_id: int,
    ) -> None:
        """Create the PUDL table dashboard from the position template.

        Args:
            table_name: The name of the table to create the dashboard for.
            table_dataset_id: The ID of the dataset for the table.
            table_chart_id: The ID of the chart for the table.
            data_dictionary_chart_id: The ID of the chart for the data dictionary.
            row_count_chart_id: The ID of the chart for the row count.

        Raises:
            ValueError: If any dashboard/chart API call fails.
        """
        # Load the dashboard layout ("position") jinja template from disk.
        file_path = (
            Path(__file__).parent / "templates/dashboards/table_download_position.json"
        )
        with file_path.open() as file:
            json_template_str = file.read()

        template = Template(json_template_str)

        # Substitute the chart/dataset IDs into the layout template.
        rendered_json_str = template.render(
            {
                "table_chart_id": table_chart_id,
                "data_dictionary_chart_id": data_dictionary_chart_id,
                "row_count_chart_id": row_count_chart_id,
                "table_name": table_name,
                "table_dataset_id": table_dataset_id,
            }
        )

        # Create the dashboard itself.
        data = {
            "dashboard_title": table_name,
            "position_json": rendered_json_str,
            "slug": table_name,
        }
        dash_id = self._post_json("/api/v1/dashboard/", data)["id"]

        # Attach the dashboard id to each chart. If you don't do this the
        # dashboard layout will be correct but Superset won't be able to find
        # the charts.
        data = {"dashboards": [dash_id]}
        for chart_id in (table_chart_id, data_dictionary_chart_id, row_count_chart_id):
            r = self.session.put(self.base_url + f"/api/v1/chart/{chart_id}", json=data)
            if r.status_code != 200:
                raise ValueError(f"{r.status_code}: {r.content}")

    def create_all_table_dashboard_assets(self, table_name: str) -> None:
        """Create all the assets needed for a PUDL table dashboard.

        Creates the table dataset, a data-dictionary dataset, the table and
        row-count charts, and finally the dashboard that ties them together.

        Args:
            table_name: The name of the table to create the dashboard for.
        """
        table_dataset_id = self.create_dataset(table_name)

        # Create the data dictionary dataset. NOTE: table_name is interpolated
        # directly into SQL; acceptable only because it comes from a trusted
        # operator via the CLI, not from untrusted input.
        sql = f"SELECT column_name, data_type, comment AS description FROM duckdb_columns() where table_name = '{table_name}';"  # noqa: S608
        data_dict_dataset_id = self.create_dataset(
            table_name + " Column Descriptions", sql=sql
        )

        # Create the table chart and the data dictionary table chart.
        table_chart_id = self.create_table_chart(table_dataset_id, table_name)
        data_dictionary_chart_id = self.create_table_chart(
            data_dict_dataset_id, f"{table_name} Column Descriptions"
        )  # TODO: add searchable option to the table!
        row_count_chart_id = self.create_row_count_chart(table_dataset_id, table_name)

        # Create the dashboard tying all the charts together.
        self.create_dashboard(
            table_name,
            table_dataset_id,
            table_chart_id,
            data_dictionary_chart_id,
            row_count_chart_id,
        )


@click.command()
@click.argument("table_names", nargs=-1)
@click.option(
    "--loglevel",
    help="Set logging level (DEBUG, INFO, WARNING, ERROR, or CRITICAL).",
    default="INFO",
)
def create_table_dashboards(loglevel, table_names):
    """Create a Superset dashboard for each PUDL table in TABLE_NAMES.

    For every table name given, creates the dataset, charts, and dashboard
    via the Superset API. Requires SUPERSET_USERNAME and SUPERSET_PASSWORD
    to be set in the environment.
    """
    # Install colored logging on the root logger so library loggers are
    # formatted consistently too.
    root_logger = logging.getLogger()
    log_format = "%(asctime)s [%(levelname)8s] %(name)s:%(lineno)s %(message)s"
    coloredlogs.install(fmt=log_format, level=loglevel, logger=root_logger)

    if not table_names:
        raise click.UsageError("At least one argument is required.")

    client = SupersetClient()
    for table_name in table_names:
        # Lazy %-args: the message is only formatted if the level is enabled.
        logger.info("Creating dashboard for %s", table_name)
        client.create_all_table_dashboard_assets(table_name)
        logger.info("Dashboard for %s created successfully.", table_name)


if __name__ == "__main__":
    create_table_dashboards()
36 changes: 36 additions & 0 deletions superset/automation/templates/charts/row_count.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,36 @@
{
"viz_type": "big_number_total",
"metric": {
"aggregate": null,
"column": null,
"datasourceWarning": false,
"expressionType": "SQL",
"hasCustomLabel": false,
"label": "COUNT(*)",
"optionName": "metric_ds9fqpcx819_axxvkrtyf0c",
"sqlExpression": "COUNT(*)"
},
"adhoc_filters": [
{
"clause": "WHERE",
"comparator": "No filter",
"expressionType": "SIMPLE",
"operator": "TEMPORAL_RANGE",
"subject": "report_date"
}
],
"subheader": "Warning! You are limited to viewing and downloading 100,000 rows. You can apply filters to grab the data you need.",
"header_font_size": 0.6,
"subheader_font_size": 0.15,
"y_axis_format": "SMART_NUMBER",
"time_format": "smart_date",
"conditional_formatting": [
{
"colorScheme": "#A7323F",
"column": "COUNT(*)",
"operator": ">",
"targetValue": 100000
}
],
"extra_form_data": {}
}
26 changes: 26 additions & 0 deletions superset/automation/templates/charts/table_download.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
{
"viz_type": "table",
"query_mode": "raw",
"groupby": [],
"temporal_columns_lookup": {},
"all_columns": [],
"percent_metrics": [],
"adhoc_filters": [
{
"clause": "WHERE",
"comparator": "No filter",
"expressionType": "SIMPLE",
"operator": "TEMPORAL_RANGE",
"subject": "report_date"
}
],
"order_by_cols": [],
"server_pagination": false,
"row_limit": "100000",
"table_timestamp_format": "smart_date",
"show_cell_bars": true,
"color_pn": true,
"conditional_formatting": [],
"extra_form_data": {},
"include_search": true
}
Loading
Loading