Skip to content

Commit

Permalink
Merge pull request #41 from datakind/pdp-extend-modeling-functionality
Browse files Browse the repository at this point in the history
[pdp] Extend modeling functionality
  • Loading branch information
bdewilde authored Dec 17, 2024
2 parents a776967 + 992401b commit 8970ef8
Show file tree
Hide file tree
Showing 2 changed files with 159 additions and 0 deletions.
75 changes: 75 additions & 0 deletions src/student_success_tool/modeling/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
import typing as t
from collections.abc import Sequence

import numpy as np
import pandas as pd
import sklearn.utils


def compute_dataset_splits(
    df: pd.DataFrame,
    *,
    labels: Sequence[str] = ("train", "test", "valid"),
    fracs: Sequence[float] = (0.6, 0.2, 0.2),
    shuffle: bool = True,
    seed: t.Optional[int] = None,
) -> pd.Series:
    """
    Split input dataset into random subsets with configurable proportions;
    by default, Databricks' standard train/test/valid splits are generated.

    Every row of ``df`` is assigned one label drawn at random from ``labels``
    with probabilities ``fracs``, so the realized subset sizes only approximate
    the requested proportions.

    Args:
        df: Dataset to split; only its length and index are used.
        labels: Labels for each subset into which ``df`` is split.
        fracs: Approximate proportions of each subset into which ``df`` is split;
            corresponds 1:1 with each label in ``labels`` .
        shuffle: Forwarded to :meth:`numpy.random.Generator.choice()`.
            NOTE(review): numpy documents this flag as affecting only sampling
            *without* replacement, whereas this function samples with replacement,
            so it presumably has no effect on the result -- confirm.
        seed: Optional integer used to set state for the underlying random generator;
            specify a value for reproducible splits, otherwise each call is unique.

    Returns:
        Series of dtype "string" named "split", sharing ``df``'s index, whose values
        are the subset label assigned to each row.

    Raises:
        ValueError: If ``labels`` and ``fracs`` have different lengths. Other invalid
            ``fracs`` (e.g. not summing to 1) are rejected by numpy itself.

    See Also:
        - :func:`sklearn.model_selection.train_test_split()`
    """
    if len(labels) != len(fracs):
        raise ValueError(
            f"the number of specified labels ({len(labels)}) and fracs ({len(fracs)}) "
            "must be the same"
        )

    rng = np.random.default_rng(seed=seed)
    return pd.Series(
        data=rng.choice(labels, size=len(df), p=fracs, shuffle=shuffle),
        index=df.index,
        dtype="string",
        name="split",
    )


def compute_sample_weights(
    df: pd.DataFrame,
    *,
    target_col: str = "target",
    class_weight: t.Literal["balanced"] | dict[object, float] = "balanced",
) -> pd.Series:
    """
    Estimate sample weights by class for imbalanced datasets.

    Args:
        df: Dataset containing the target column.
        target_col: Name of column in ``df`` containing class label values
            i.e. "targets" to be predicted.
        class_weight: Weights associated with classes in the form ``{class_label: weight}``
            or "balanced" to automatically adjust weights inversely proportional to
            class frequencies in the input data. Weights may be fractional.

    Returns:
        Series of dtype "float32" named "sample_weight", sharing ``df``'s index,
        holding one weight per row.

    Raises:
        KeyError: If ``target_col`` is not a column of ``df``.

    See Also:
        - :func:`sklearn.utils.class_weight.compute_sample_weight()`
    """
    return pd.Series(
        data=sklearn.utils.class_weight.compute_sample_weight(
            class_weight, df[target_col]
        ),
        index=df.index,
        dtype="float32",
        name="sample_weight",
    )
84 changes: 84 additions & 0 deletions tests/modeling/test_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,84 @@
import pandas as pd
import pytest

from student_success_tool.modeling import utils


# NOTE: splits are drawn at random per row, so observed proportions fluctuate
# around ``fracs``. With rtol=0.15 a 0.2 fraction may deviate by only +/-0.03;
# at 1,000 rows that is ~2.4 standard deviations, which makes unseeded cases
# fail a few percent of the time. 10,000 rows widens the margin to ~7.5
# standard deviations while keeping the unseeded code path covered.
@pytest.mark.parametrize(
    ["df", "labels", "fracs", "shuffle", "seed"],
    [
        (
            pd.DataFrame(data=list(range(10_000))),
            ["train", "test"],
            [0.5, 0.5],
            True,
            None,
        ),
        (
            pd.DataFrame(data=list(range(10_000))),
            ["train", "test", "valid"],
            [0.6, 0.2, 0.2],
            False,
            None,
        ),
        (
            pd.DataFrame(data=list(range(10_000))),
            ["train", "test"],
            [0.5, 0.5],
            True,
            42,
        ),
    ],
)
def test_compute_dataset_splits(df, labels, fracs, shuffle, seed):
    """
    Check that splits have the right type and length, approximate the requested
    proportions, and are reproducible when a seed is given.
    """
    obs = utils.compute_dataset_splits(
        df, labels=labels, fracs=fracs, shuffle=shuffle, seed=seed
    )
    assert isinstance(obs, pd.Series)
    assert len(obs) == len(df)
    obs_value_counts = obs.value_counts(normalize=True)
    exp_value_counts = pd.Series(
        data=list(fracs),
        index=pd.Index(list(labels), dtype="string", name="split"),
        name="proportion",
        dtype="Float64",
    )
    # assert_series_equal raises AssertionError itself on mismatch;
    # check_like=True ignores the frequency-based ordering of value_counts
    pd.testing.assert_series_equal(
        obs_value_counts, exp_value_counts, rtol=0.15, check_like=True
    )
    if seed is not None:
        # same seed => identical assignment, row for row
        obs2 = utils.compute_dataset_splits(
            df, labels=labels, fracs=fracs, shuffle=shuffle, seed=seed
        )
        assert obs.equals(obs2)


@pytest.mark.parametrize(
    "df, target_col, class_weight, exp",
    [
        # "balanced": weights inversely proportional to class frequency
        (
            pd.DataFrame({"target": [1, 1, 1, 0]}),
            "target",
            "balanced",
            pd.Series(
                [0.667, 0.667, 0.667, 2.0],
                dtype="float32",
                name="sample_weight",
            ),
        ),
        # explicit per-class weights are mapped onto each row's class
        (
            pd.DataFrame({"target": [1, 1, 1, 0]}),
            "target",
            {1: 2, 0: 0.5},
            pd.Series(
                [2.0, 2.0, 2.0, 0.5],
                dtype="float32",
                name="sample_weight",
            ),
        ),
    ],
)
def test_compute_sample_weights(df, target_col, class_weight, exp):
    """
    Check that sample weights come back as a Series with one entry per row
    and match the expected per-row weights within 1% relative tolerance.
    """
    weights = utils.compute_sample_weights(
        df,
        target_col=target_col,
        class_weight=class_weight,
    )
    assert isinstance(weights, pd.Series)
    assert len(weights) == len(df)
    # raises AssertionError on mismatch, returns None otherwise
    comparison = pd.testing.assert_series_equal(weights, exp, rtol=0.01)
    assert comparison is None

0 comments on commit 8970ef8

Please sign in to comment.