-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #41 from datakind/pdp-extend-modeling-functionality
[pdp] Extend modeling functionality
- Loading branch information
Showing
2 changed files
with
159 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,75 @@ | ||
import typing as t | ||
from collections.abc import Sequence | ||
|
||
import numpy as np | ||
import pandas as pd | ||
import sklearn.utils | ||
|
||
|
||
def compute_dataset_splits(
    df: pd.DataFrame,
    *,
    labels: Sequence[str] = ("train", "test", "valid"),
    fracs: Sequence[float] = (0.6, 0.2, 0.2),
    shuffle: bool = True,
    seed: t.Optional[int] = None,
) -> pd.Series:
    """
    Split input dataset into random subsets with configurable proportions;
    by default, Databricks' standard train/test/valid splits are generated.

    Each row of ``df`` is assigned one label drawn at random from ``labels``
    with probabilities ``fracs``, so the realized subset sizes only
    approximate the requested proportions.

    Args:
        df: Dataset to split; only its length and index are used here.
        labels: Labels for each subset into which ``df`` is split.
        fracs: Approximate proportions of each subset into which ``df`` is split;
            corresponds 1:1 with each label in ``labels``. Passed as the
            probability vector ``p`` of :meth:`numpy.random.Generator.choice()`.
        shuffle: Forwarded unchanged to :meth:`numpy.random.Generator.choice()`.
            NOTE(review): NumPy documents this flag as affecting only sampling
            *without* replacement, which is not what is requested here --
            confirm whether it has any effect on the result.
        seed: Optional integer used to set state for the underlying random generator;
            specify a value for reproducible splits, otherwise each call is unique.

    Returns:
        Series named "split" with "string" dtype, sharing ``df``'s index and
        holding one label per row.

    Raises:
        ValueError: If ``labels`` and ``fracs`` have different lengths.

    See Also:
        - :func:`sklearn.model_selection.train_test_split()`
    """
    if len(labels) != len(fracs):
        raise ValueError(
            f"the number of specified labels ({len(labels)}) and fracs ({len(fracs)}) "
            "must be the same"
        )

    # a fresh generator per call keeps results independent of global numpy state
    rng = np.random.default_rng(seed=seed)
    return pd.Series(
        data=rng.choice(labels, size=len(df), p=fracs, shuffle=shuffle),
        index=df.index,
        dtype="string",
        name="split",
    )
|
||
|
||
def compute_sample_weights(
    df: pd.DataFrame,
    *,
    target_col: str = "target",
    class_weight: t.Literal["balanced"] | dict[object, float] = "balanced",
) -> pd.Series:
    """
    Estimate sample weights by class for imbalanced datasets.

    Args:
        df: Dataset containing the target column; its index is reused
            for the returned series.
        target_col: Name of column in ``df`` containing class label values
            i.e. "targets" to be predicted.
        class_weight: Weights associated with classes in the form ``{class_label: weight}``
            or "balanced" to automatically adjust weights inversely proportional to
            class frequencies in the input data. Weights may be fractional.

    Returns:
        Series named "sample_weight" with "float32" dtype, sharing ``df``'s index
        and holding one weight per row.

    See Also:
        - :func:`sklearn.utils.class_weight.compute_sample_weight()`
    """
    # the weighting logic itself is delegated entirely to scikit-learn
    return pd.Series(
        data=sklearn.utils.class_weight.compute_sample_weight(
            class_weight, df[target_col]
        ),
        index=df.index,
        dtype="float32",
        name="sample_weight",
    )
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,84 @@ | ||
import pandas as pd | ||
import pytest | ||
|
||
from student_success_tool.modeling import utils | ||
|
||
|
||
@pytest.mark.parametrize(
    ["df", "labels", "fracs", "shuffle", "seed"],
    [
        # two even splits, shuffled, unseeded
        (
            pd.DataFrame(data=list(range(1000))),
            ["train", "test"],
            [0.5, 0.5],
            True,
            None,
        ),
        # three uneven splits, unshuffled, unseeded
        (
            pd.DataFrame(data=list(range(1000))),
            ["train", "test", "valid"],
            [0.6, 0.2, 0.2],
            False,
            None,
        ),
        # two even splits, shuffled, seeded => also checks reproducibility
        (
            pd.DataFrame(data=list(range(1000))),
            ["train", "test"],
            [0.5, 0.5],
            True,
            42,
        ),
    ],
)
def test_compute_dataset_splits(df, labels, fracs, shuffle, seed):
    """
    Check that splits come back as a series aligned with ``df``, that observed
    label proportions are close to ``fracs``, and that a fixed ``seed``
    reproduces the same assignment.
    """
    split_kwargs = {"labels": labels, "fracs": fracs, "shuffle": shuffle, "seed": seed}
    splits = utils.compute_dataset_splits(df, **split_kwargs)
    assert isinstance(splits, pd.Series)
    assert len(splits) == len(df)

    expected_props = pd.Series(
        data=list(fracs),
        index=pd.Index(list(labels), dtype="string", name="split"),
        name="proportion",
        dtype="Float64",
    )
    observed_props = splits.value_counts(normalize=True)
    # proportions are only compared within a relative tolerance, ignoring index order;
    # unseeded cases draw fresh random splits on every run
    comparison = pd.testing.assert_series_equal(
        observed_props, expected_props, rtol=0.15, check_like=True
    )
    assert comparison is None

    if seed is None:
        return
    # same seed + same arguments must give an identical series
    splits_again = utils.compute_dataset_splits(df, **split_kwargs)
    assert splits.equals(splits_again)
|
||
|
||
@pytest.mark.parametrize(
    ["df", "target_col", "class_weight", "exp"],
    [
        # "balanced": the rarer class (0) is expected to get the larger weight
        (
            pd.DataFrame({"target": [1, 1, 1, 0]}),
            "target",
            "balanced",
            pd.Series(
                [0.667, 0.667, 0.667, 2.0], dtype="float32", name="sample_weight"
            ),
        ),
        # explicit mapping: each row is expected to get its class's given weight
        (
            pd.DataFrame({"target": [1, 1, 1, 0]}),
            "target",
            {1: 2, 0: 0.5},
            pd.Series([2.0, 2.0, 2.0, 0.5], dtype="float32", name="sample_weight"),
        ),
    ],
)
def test_compute_sample_weights(df, target_col, class_weight, exp):
    """
    Check that sample weights come back as a series with one entry per row of
    ``df`` and match the expected values within a 1% relative tolerance.
    """
    weights = utils.compute_sample_weights(
        df, target_col=target_col, class_weight=class_weight
    )
    assert isinstance(weights, pd.Series)
    assert len(weights) == len(df)
    comparison = pd.testing.assert_series_equal(weights, exp, rtol=0.01)
    assert comparison is None