Add (partial) train+eval model template nb #42

Merged: 9 commits, Dec 23, 2024
5 changes: 3 additions & 2 deletions notebooks/pdp/01-data-assessment-eda-TEMPLATE.py
@@ -42,6 +42,7 @@
import pandas as pd
import seaborn as sb
from databricks.connect import DatabricksSession
+from databricks.sdk.runtime import dbutils

from student_success_tool.analysis import pdp

@@ -301,7 +302,7 @@
# COMMAND ----------

# decent, general-purpose summarization of a data frame
-dbutils.data.summarize(df_course, precise=False)  # noqa: F405
+dbutils.data.summarize(df_course, precise=False)

# COMMAND ----------

@@ -311,7 +312,7 @@

# COMMAND ----------

-dbutils.data.summarize(df_cohort, precise=True)  # noqa: F405
+dbutils.data.summarize(df_cohort, precise=True)

# COMMAND ----------

159 changes: 159 additions & 0 deletions notebooks/pdp/03-train-evaluate-model-TEMPLATE.py
@@ -0,0 +1,159 @@
# Databricks notebook source
# MAGIC %md
# MAGIC # SST Train and Evaluate Model: [SCHOOL]
# MAGIC
# MAGIC Third step in the process of transforming raw (PDP) data into actionable, data-driven insights for advisors: load a prepared modeling dataset, configure the experiment tracking framework, then train and evaluate a predictive model.
# MAGIC
# MAGIC ### References
# MAGIC
# MAGIC - [Data science product components (Confluence doc)](https://datakind.atlassian.net/wiki/spaces/TT/pages/237862913/Data+science+product+components+the+modeling+process)
# MAGIC - [Databricks Classification with AutoML](https://docs.databricks.com/en/machine-learning/automl/classification.html)
# MAGIC - [Databricks AutoML Python API reference](https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html)
# MAGIC - [Databricks runtimes release notes](https://docs.databricks.com/en/release-notes/runtime/index.html)
# MAGIC - TODO: [SCHOOL] website

# COMMAND ----------

# MAGIC %md
# MAGIC # setup

# COMMAND ----------

# MAGIC %sh python --version

# COMMAND ----------

# install dependencies, most of which should come through our 1st-party SST package
# %pip install "student-success-tool==0.1.0"
# %pip install git+https://github.com/datakind/student-success-tool.git@develop
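# NOTE: the two installs above are alternatives; pin the released version for
# reproducibility, or install from the develop branch to pick up unreleased changes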

# COMMAND ----------

# MAGIC %restart_python

# COMMAND ----------

import logging

# import mlflow
from databricks.connect import DatabricksSession
from databricks.sdk.runtime import dbutils

from student_success_tool import modeling
from student_success_tool.analysis import pdp

# COMMAND ----------

logging.basicConfig(level=logging.INFO)
logging.getLogger("py4j").setLevel(logging.WARNING)  # quiet the chatty py4j (Spark-JVM bridge) logger

try:
    spark_session = DatabricksSession.builder.getOrCreate()
except Exception:
    logging.warning("unable to create spark session; are you in a Databricks runtime?")

# COMMAND ----------

# MAGIC %md
# MAGIC ## configuration

# COMMAND ----------

# when run as a Databricks job task, the notebook's entry-point bindings carry
# the job parameters; interactive runs have none, so fall back to a sentinel value
run_parameters = dict(dbutils.notebook.entry_point.getCurrentBindings())
job_run_id = run_parameters.get("job_run_id", "interactive")

# COMMAND ----------

# TODO: specify school-specific configuration
institution_id = "SCHOOL"
table_name = "CATALOG.SCHEMA.TABLE"
student_id_col = "student_guid"
target_col = "target"
student_group_cols = [
    "student_age",
    "race",
    "ethnicity",
    "gender",
    "first_gen",
]
optional_automl_parameters = {
    "split_col": "split",
    "sample_weight_col": "sample_weight",
    # "pos_label": True,
    # "exclude_frameworks": ["lightgbm", "xgboost"],
    "timeout_minutes": 5,
}
optimization_metric = "log_loss"

prediction_col = "prediction"
risk_score_col = "risk_score"

# exclude student group columns from model features (they stay in the dataset,
# e.g. for disaggregated evaluation), deduplicating via set
optional_automl_parameters["exclude_cols"] = list(
    set(optional_automl_parameters.get("exclude_cols", []) + student_group_cols)
)
optional_automl_parameters

# COMMAND ----------

# MAGIC %md
# MAGIC # read modeling dataset

# COMMAND ----------

df = pdp.schemas.PDPLabeledDataSchema(
    pdp.dataio.read_data_from_delta_table(table_name, spark_session=spark_session)
)
print(f"rows x cols = {df.shape}")
df.head()

# COMMAND ----------

# sanity-check the relative sizes of the train/test/validate splits, if present
if split_col := optional_automl_parameters.get("split_col"):
    print(df[split_col].value_counts(normalize=True))

# COMMAND ----------

# MAGIC %md
# MAGIC # train model

# COMMAND ----------

summary = modeling.training.run_automl_classification(
    df,
    target_col=target_col,
    optimization_metric=optimization_metric,
    institution_id=institution_id,
    job_run_id=job_run_id,
    student_id_col=student_id_col,
    **optional_automl_parameters,
)

experiment_id = summary.experiment.experiment_id
experiment_run_id = summary.best_trial.mlflow_run_id
print(
    f"experiment_id: {experiment_id}"
    f"\n{optimization_metric} metric distribution = {summary.metric_distribution}"
    f"\nbest trial experiment_run_id: {experiment_run_id}"
)

dbutils.jobs.taskValues.set(key="experiment_id", value=experiment_id)
dbutils.jobs.taskValues.set(key="experiment_run_id", value=experiment_run_id)

# COMMAND ----------

# MAGIC %md
# MAGIC # evaluate model

# COMMAND ----------

# MAGIC %md
# MAGIC **TODO:** This doesn't currently work, owing to version incompatibilities between the Databricks AutoML runtime and the `student-success-tool` package. Revisit this once we're using our own "auto"-ML framework. Look to existing notebooks for guidance on additional evaluation needs.

# COMMAND ----------

# mlflow.sklearn.load_model(summary.best_trial.model_path)
model = summary.best_trial.load_model()
model
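
# NOTE (illustrative sketch, not part of this PR): once the version incompatibilities
# noted above are resolved, the loaded model should behave like a scikit-learn
# classifier, so held-out evaluation could look roughly like the following; the
# "split"/"test" values and the predict_proba interface are assumptions
# df_test = df.loc[df["split"] == "test", :]
# pred_labels = model.predict(df_test)
# pred_probs = model.predict_proba(df_test)[:, 1]  # P(positive class)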

# COMMAND ----------
4 changes: 2 additions & 2 deletions src/student_success_tool/modeling/utils.py
@@ -19,14 +19,14 @@
def compute_dataset_splits(
    df: pd.DataFrame,
    *,
-    labels: Sequence[str] = ("train", "test", "valid"),
+    labels: Sequence[str] = ("train", "test", "validate"),
    fracs: Sequence[float] = (0.6, 0.2, 0.2),
    shuffle: bool = True,
    seed: t.Optional[int] = None,
) -> pd.Series:
    """
    Split input dataset into random subsets with configurable proportions;
-    by default, Databricks' standard train/test/valid splits are generated.
+    by default, Databricks' standard train/test/validate splits are generated.

    Args:
        df
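
A minimal usage sketch (illustrative, not part of this diff), assuming the import
path matches this file's location and the function returns one split label per row:

    import pandas as pd
    from student_success_tool.modeling.utils import compute_dataset_splits

    df = pd.DataFrame({"student_guid": range(100)})
    df["split"] = compute_dataset_splits(df, seed=42)
    print(df["split"].value_counts(normalize=True))  # roughly 0.6 / 0.2 / 0.2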