Merge pull request #41 from datakind/pdp-extend-modeling-functionality

[pdp] Extend modeling functionality
datakind · Dec 17, 2024 · 8970ef8 · 8970ef8
2 parents a776967 + 992401b
commit 8970ef8
Show file tree

Hide file tree

Showing 2 changed files with 159 additions and 0 deletions.
diff --git a/src/student_success_tool/modeling/utils.py b/src/student_success_tool/modeling/utils.py
@@ -0,0 +1,75 @@
+import typing as t
+from collections.abc import Sequence
+
+import numpy as np
+import pandas as pd
+import sklearn.utils
+
+
+def compute_dataset_splits(
+    df: pd.DataFrame,
+    *,
+    labels: Sequence[str] = ("train", "test", "valid"),
+    fracs: Sequence[float] = (0.6, 0.2, 0.2),
+    shuffle: bool = True,
+    seed: t.Optional[int] = None,
+) -> pd.Series:
+    """
+    Randomly assign each row of the input dataset to a labeled subset with
+    configurable proportions; by default, Databricks' standard train/test/valid
+    splits are generated.
+
+    A label is drawn independently for every row, using ``fracs`` as the draw
+    probabilities, so the realized subset sizes only approximate ``fracs`` .
+
+    Args:
+        df: Dataset to split; only its length and index are used.
+        labels: Labels for each subset into which ``df`` is split.
+        fracs: Approximate proportions of each subset into which ``df`` is split;
+            corresponds 1:1 with each label in ``labels`` . Values are passed to
+            numpy as sampling probabilities, which it requires to sum to 1.
+        shuffle: Passed through to :meth:`numpy.random.Generator.choice()` .
+            NOTE: numpy documents this flag as only affecting sampling *without*
+            replacement, whereas labels are drawn with replacement here, so it
+            is not expected to change the result.
+        seed: Optional integer used to set state for the underlying random generator;
+            specify a value for reproducible splits, otherwise each call is unique.
+
+    Returns:
+        Series of dtype "string" named "split", sharing the index of ``df`` ,
+        whose values are the label assigned to each row.
+
+    Raises:
+        ValueError: If ``labels`` and ``fracs`` do not have the same length.
+
+    See Also:
+        - :func:`sklearn.model_selection.train_test_split()`
+    """
+    if len(labels) != len(fracs):
+        raise ValueError(
+            f"the number of specified labels ({len(labels)}) and fracs ({len(fracs)}) "
+            "must be the same"
+        )
+
+    rng = np.random.default_rng(seed=seed)
+    return pd.Series(
+        # one independent draw per row, weighted by the requested proportions
+        data=rng.choice(labels, size=len(df), p=fracs, shuffle=shuffle),
+        index=df.index,
+        dtype="string",
+        name="split",
+    )
+
+
+def compute_sample_weights(
+    df: pd.DataFrame,
+    *,
+    target_col: str = "target",
+    class_weight: t.Literal["balanced"] | dict[object, float] = "balanced",
+) -> pd.Series:
+    """
+    Estimate sample weights by class for imbalanced datasets.
+
+    Args:
+        df: Dataset containing the ``target_col`` column; its index is reused
+            for the returned series.
+        target_col: Name of column in ``df`` containing class label values
+            i.e. "targets" to be predicted.
+        class_weight: Weights associated with classes in the form ``{class_label: weight}``
+            or "balanced" to automatically adjust weights inversely proportional to
+            class frequencies in the input data. Weights may be fractional.
+
+    Returns:
+        Series of dtype "float32" named "sample_weight", sharing the index of
+        ``df`` , holding one weight per row.
+
+    See Also:
+        - :func:`sklearn.utils.class_weight.compute_sample_weight()`
+    """
+    return pd.Series(
+        data=sklearn.utils.class_weight.compute_sample_weight(
+            class_weight, df[target_col]
+        ),
+        index=df.index,
+        dtype="float32",
+        name="sample_weight",
+    )
diff --git a/tests/modeling/test_utils.py b/tests/modeling/test_utils.py
@@ -0,0 +1,84 @@
+import pandas as pd
+import pytest
+
+from student_success_tool.modeling import utils
+
+
+@pytest.mark.parametrize(
+    ["df", "labels", "fracs", "shuffle", "seed"],
+    [
+        (
+            pd.DataFrame(data=list(range(1000))),
+            ["train", "test"],
+            [0.5, 0.5],
+            True,
+            None,
+        ),
+        (
+            pd.DataFrame(data=list(range(1000))),
+            ["train", "test", "valid"],
+            [0.6, 0.2, 0.2],
+            False,
+            None,
+        ),
+        (
+            pd.DataFrame(data=list(range(1000))),
+            ["train", "test"],
+            [0.5, 0.5],
+            True,
+            42,
+        ),
+    ],
+)
+def test_compute_dataset_splits(df, labels, fracs, shuffle, seed):
+    """
+    Check that splits cover every row, use only the requested labels, land close
+    to the requested proportions, and are reproducible when a seed is given.
+    """
+    obs = utils.compute_dataset_splits(
+        df, labels=labels, fracs=fracs, shuffle=shuffle, seed=seed
+    )
+    assert isinstance(obs, pd.Series)
+    assert len(obs) == len(df)
+    assert set(obs.tolist()) <= set(labels)
+    obs_value_counts = obs.value_counts(normalize=True)
+    exp_value_counts = pd.Series(
+        data=list(fracs),
+        index=pd.Index(list(labels), dtype="string", name="split"),
+        name="proportion",
+        dtype="Float64",
+    )
+    # Splits are random draws, so compare proportions with an absolute tolerance
+    # several standard deviations wide (1000 rows => std. dev. of roughly 0.013-0.016);
+    # a relative tolerance is too tight for the small fractions and makes the
+    # unseeded cases fail intermittently. assert_series_equal raises on mismatch.
+    pd.testing.assert_series_equal(
+        obs_value_counts, exp_value_counts, atol=0.07, check_like=True
+    )
+    if seed is not None:
+        # same seed, same inputs => identical assignment
+        obs2 = utils.compute_dataset_splits(
+            df, labels=labels, fracs=fracs, shuffle=shuffle, seed=seed
+        )
+        assert obs.equals(obs2)
+
+
+@pytest.mark.parametrize(
+    ["df", "target_col", "class_weight", "exp"],
+    [
+        (
+            pd.DataFrame({"target": [1, 1, 1, 0]}),
+            "target",
+            "balanced",
+            pd.Series(
+                [0.667, 0.667, 0.667, 2.0], dtype="float32", name="sample_weight"
+            ),
+        ),
+        (
+            pd.DataFrame({"target": [1, 1, 1, 0]}),
+            "target",
+            {1: 2, 0: 0.5},
+            pd.Series([2.0, 2.0, 2.0, 0.5], dtype="float32", name="sample_weight"),
+        ),
+    ],
+)
+def test_compute_sample_weights(df, target_col, class_weight, exp):
+    """
+    Check that sample weights match expectations for both "balanced" and
+    explicit per-class weighting, with one weight per input row.
+    """
+    obs = utils.compute_sample_weights(
+        df, target_col=target_col, class_weight=class_weight
+    )
+    assert isinstance(obs, pd.Series)
+    assert len(obs) == len(df)
+    # assert_series_equal raises AssertionError itself on any mismatch
+    pd.testing.assert_series_equal(obs, exp, rtol=0.01)