Skip to content

Commit

Permalink
Merge pull request #42 from datakind/add-train-model-template-nb
Browse files Browse the repository at this point in the history
Add (partial) train+eval model template nb
  • Loading branch information
bdewilde authored Dec 23, 2024
2 parents f961bdb + c430c51 commit 488c442
Show file tree
Hide file tree
Showing 3 changed files with 164 additions and 4 deletions.
5 changes: 3 additions & 2 deletions notebooks/pdp/01-data-assessment-eda-TEMPLATE.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,7 @@
import pandas as pd
import seaborn as sb
from databricks.connect import DatabricksSession
from databricks.sdk.runtime import dbutils

from student_success_tool.analysis import pdp

Expand Down Expand Up @@ -301,7 +302,7 @@
# COMMAND ----------

# decent, general-purpose summarization of a data frame
dbutils.data.summarize(df_course, precise=False) # noqa: F405
dbutils.data.summarize(df_course, precise=False)

# COMMAND ----------

Expand All @@ -311,7 +312,7 @@

# COMMAND ----------

dbutils.data.summarize(df_cohort, precise=True) # noqa: F405
dbutils.data.summarize(df_cohort, precise=True)

# COMMAND ----------

Expand Down
159 changes: 159 additions & 0 deletions notebooks/pdp/03-train-evaluate-model-TEMPLATE.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # SST Train and Evaluate Model: [SCHOOL]
# MAGIC
# MAGIC Third step in the process of transforming raw (PDP) data into actionable, data-driven insights for advisors: load a prepared modeling dataset, configure experiment tracking framework, then train and evaluate a predictive model.
# MAGIC
# MAGIC ### References
# MAGIC
# MAGIC - [Data science product components (Confluence doc)](https://datakind.atlassian.net/wiki/spaces/TT/pages/237862913/Data+science+product+components+the+modeling+process)
# MAGIC - [Databricks Classification with AutoML](https://docs.databricks.com/en/machine-learning/automl/classification.html)
# MAGIC - [Databricks AutoML Python API reference](https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html)
# MAGIC - [Databricks runtimes release notes](https://docs.databricks.com/en/release-notes/runtime/index.html)
# MAGIC - TODO: [SCHOOL] website

# COMMAND ----------

# MAGIC %md
# MAGIC # setup

# COMMAND ----------

# MAGIC %sh python --version

# COMMAND ----------

# install dependencies, most of which should come through our 1st-party SST package
# %pip install "student-success-tool==0.1.0"
# %pip install git+https://github.com/datakind/student-success-tool.git@develop

# COMMAND ----------

# MAGIC %restart_python

# COMMAND ----------

import logging

# import mlflow
from databricks.connect import DatabricksSession
from databricks.sdk.runtime import dbutils

from student_success_tool import modeling
from student_success_tool.analysis import pdp

# COMMAND ----------

# Configure root logging and quiet the noisy Databricks py4j bridge.
logging.basicConfig(level=logging.INFO)
logging.getLogger("py4j").setLevel(logging.WARNING)  # ignore databricks logger

# Create a Spark session if we're running inside a Databricks runtime.
# Predefine to None so that, if session creation fails, later references
# fail with a clear "spark_session is None" error rather than a NameError.
spark_session = None
try:
    spark_session = DatabricksSession.builder.getOrCreate()
except Exception:
    logging.warning("unable to create spark session; are you in a Databricks runtime?")

# COMMAND ----------

# MAGIC %md
# MAGIC ## configuration

# COMMAND ----------

# Pull the notebook's current parameter bindings from the Databricks runtime;
# when the notebook is run interactively (not as a job task), there is no
# "job_run_id" binding, so fall back to a sentinel value.
bindings = dbutils.notebook.entry_point.getCurrentBindings()
run_parameters = dict(bindings)
job_run_id = run_parameters.get("job_run_id", "interactive")

# COMMAND ----------

# TODO: specify school-specific configuration
institution_id = "SCHOOL"  # short identifier for the institution, passed to AutoML run
table_name = "CATALOG.SCHEMA.TABLE"  # fully-qualified Delta table holding the modeling dataset
student_id_col = "student_guid"  # unique student identifier column
target_col = "target"  # label column the model is trained to predict
# demographic/group columns; merged into AutoML's exclude_cols below so they
# are not used as model features — presumably for fairness reasons (confirm)
student_group_cols = [
    "student_age",
    "race",
    "ethnicity",
    "gender",
    "first_gen",
]
# extra keyword args forwarded to modeling.training.run_automl_classification
optional_automl_parameters = {
    "split_col": "split",  # pre-computed train/test/validate split assignments
    "sample_weight_col": "sample_weight",
    # "pos_label": True,
    # "exclude_frameworks": ["lightgbm", "xgboost"],
    "timeout_minutes": 5,
}
optimization_metric = "log_loss"  # metric AutoML optimizes across trials

prediction_col = "prediction"
risk_score_col = "risk_score"

# union any pre-configured excluded columns with the student-group columns,
# de-duplicating via set (note: order of the resulting list is not stable)
optional_automl_parameters["exclude_cols"] = list(
    set(optional_automl_parameters.get("exclude_cols", []) + student_group_cols)
)
# bare expression: displays the final parameter dict in the notebook output
optional_automl_parameters

# COMMAND ----------

# MAGIC %md
# MAGIC # read modeling dataset

# COMMAND ----------

# Read the prepared modeling dataset out of Delta, then validate/coerce it
# against the expected labeled-data schema on the way in.
raw_df = pdp.dataio.read_data_from_delta_table(table_name, spark_session=spark_session)
df = pdp.schemas.PDPLabeledDataSchema(raw_df)
print(f"rows x cols = {df.shape}")
df.head()

# COMMAND ----------

# If a pre-computed split column is configured, show the relative size of
# each train/test/validate split as a quick sanity check.
split_col = optional_automl_parameters.get("split_col")
if split_col:
    print(df[split_col].value_counts(normalize=True))

# COMMAND ----------

# MAGIC %md
# MAGIC # train model

# COMMAND ----------

# Kick off an AutoML classification run over the modeling dataset; the
# school-specific knobs (splits, sample weights, excluded columns, timeout)
# all come from `optional_automl_parameters` configured above.
summary = modeling.training.run_automl_classification(
    df,
    target_col=target_col,
    optimization_metric=optimization_metric,
    institution_id=institution_id,
    job_run_id=job_run_id,
    student_id_col=student_id_col,
    **optional_automl_parameters,
)

# Pull out the experiment and its best trial's run for downstream use.
experiment_id = summary.experiment.experiment_id
experiment_run_id = summary.best_trial.mlflow_run_id
result_msg = (
    f"experiment_id: {experiment_id}"
    f"\n{optimization_metric} metric distribution = {summary.metric_distribution}"
    f"\nbest trial experiment_run_id: {experiment_run_id}"
)
print(result_msg)

# Publish the identifiers as task values so later tasks in the same
# Databricks job can pick up this run's experiment and best trial.
dbutils.jobs.taskValues.set(key="experiment_id", value=experiment_id)
dbutils.jobs.taskValues.set(key="experiment_run_id", value=experiment_run_id)

# COMMAND ----------

# MAGIC %md
# MAGIC # evaluate model

# COMMAND ----------

# MAGIC %md
# MAGIC **TODO:** This doesn't currently work, owing to version incompatibilities between Databricks AutoML runtime and the `student-success-tool` package. Revisit this once we're using our own "auto"-ML framework. Look to existing nb for guidance on additional evaluation needs.

# COMMAND ----------

# Load the best trial's fitted model for evaluation; the mlflow call below
# is the equivalent direct-load path, kept for reference.
# mlflow.sklearn.load_model(summary.best_trial.model_path)
best_trial = summary.best_trial
model = best_trial.load_model()
# bare expression: displays the model in the notebook output
model

# COMMAND ----------
4 changes: 2 additions & 2 deletions src/student_success_tool/modeling/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@
def compute_dataset_splits(
df: pd.DataFrame,
*,
labels: Sequence[str] = ("train", "test", "valid"),
labels: Sequence[str] = ("train", "test", "validate"),
fracs: Sequence[float] = (0.6, 0.2, 0.2),
shuffle: bool = True,
seed: t.Optional[int] = None,
) -> pd.Series:
"""
Split input dataset into random subsets with configurable proportions;
by default, Databricks' standard train/test/valid splits are generated.
by default, Databricks' standard train/test/validate splits are generated.
Args:
df
Expand Down

0 comments on commit 488c442

Please sign in to comment.