Add option to choose which fold to use as a final predictor #614

Open · wants to merge 7 commits into master
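
Before the file-by-file diff, here is a sketch of how the new option is meant to be used, reconstructed from the changes below; the data and `results_path` are placeholders, not part of the PR:

```python
import numpy as np
from supervised import AutoML

# Sketch based on this PR's diff: with k-fold validation, chosen_fold selects
# a single fold's learner at prediction time instead of the default average
# over all fold learners. chosen_fold=None keeps the existing behavior.
X = np.random.uniform(size=(60, 2))
y = np.random.randint(0, 2, size=(60,))

automl = AutoML(
    results_path="AutoML_chosen_fold_demo",  # placeholder output directory
    algorithms=["Xgboost"],
    validation_strategy={"validation_type": "kfold", "k_folds": 5, "shuffle": True},
    chosen_fold=0,  # use only the learner trained on fold 0
)
automl.fit(X, y)
predictions = automl.predict(X)
```
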
2 changes: 2 additions & 0 deletions requirements_dev.txt
@@ -1,4 +1,6 @@
 pytest==5.3.5
+black==19.3b0
+click==8.0.2
 ipython
 pytest-cov
 coveralls
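
A likely reason for the two pins above: black 19.3b0 imports a private click module (`_unicodefun`) that click 8.1 removed, so click is held at 8.0.2 to keep that black release importable. This is an inference from the version pair, not something stated in the PR.
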
2 changes: 2 additions & 0 deletions supervised/automl.py
@@ -71,6 +71,7 @@ def __init__(
         n_jobs: int = -1,
         verbose: int = 1,
         random_state: int = 1234,
+        chosen_fold: Optional[int] = None,
     ):
         """
         Initialize `AutoML` object.
@@ -342,6 +343,7 @@ def __init__(
         self.optuna_verbose = optuna_verbose
         self.n_jobs = n_jobs
         self.random_state = random_state
+        self.chosen_fold = chosen_fold

     def fit(
         self,
32 changes: 21 additions & 11 deletions supervised/base_automl.py
@@ -39,6 +39,7 @@
from supervised.utils.metric import Metric
from supervised.utils.metric import UserDefinedEvalMetric
from supervised.utils.automl_plots import AutoMLPlots

# disable EDA
# from supervised.preprocessing.eda import EDA
from supervised.preprocessing.preprocessing_utils import PreprocessingUtils
@@ -116,6 +117,7 @@ def __init__(self):
         self._optuna_init_params = {}
         self._optuna_verbose = True
         self._n_jobs = -1
+        self._chosen_fold = None

     def _get_tuner_params(
         self, start_random_models, hill_climbing_steps, top_models_to_improve
@@ -181,6 +183,7 @@ def load(self, path):
         )
         self._n_jobs = params.get("n_jobs", self._n_jobs)
         self._random_state = params.get("random_state", self._random_state)
+        self._chosen_fold = params.get("chosen_fold", self._chosen_fold)
         stacked_models = params.get("stacked")

         best_model_name = params.get("best_model")
@@ -368,10 +371,7 @@ def train_model(self, params):
         )

         # create model framework
-        mf = ModelFramework(
-            params,
-            callbacks=[early_stop, total_time_constraint],
-        )
+        mf = ModelFramework(params, callbacks=[early_stop, total_time_constraint])

         # start training
         logger.info(
@@ -930,6 +930,7 @@ def _fit(self, X, y, sample_weight=None, cv=None):
         self._optuna_verbose = self._get_optuna_verbose()
         self._n_jobs = self._get_n_jobs()
         self._random_state = self._get_random_state()
+        self._chosen_fold = self._get_chosen_fold()

         self._adjust_validation = False
         self._apply_constraints()
@@ -1159,10 +1160,7 @@ def select_and_save_best(self, show_warnings=False):
             if m.is_valid() and m.is_fast_enough(self._max_single_prediction_time)
         ]
         if model_list:
-            self._best_model = min(
-                model_list,
-                key=lambda x: x.get_final_loss(),
-            )
+            self._best_model = min(model_list, key=lambda x: x.get_final_loss())
         # if none selected please select again and warn the user
         if (
             len(self._models)
@@ -1211,6 +1209,7 @@ def select_and_save_best(self, show_warnings=False):
"random_state": self._random_state,
"saved": self._model_subpaths,
"fit_level": self._fit_level,
"chosen_fold": self._chosen_fold,
}
if self._best_model is not None:
params["best_model"] = self._best_model.get_name()
@@ -1327,11 +1326,11 @@ def _base_predict(self, X, model=None):

             if model.get_type() == "Ensemble":
                 # Ensemble is using both original and stacked data
-                predictions = model.predict(X, X_stacked)
+                predictions = model.predict(X, X_stacked, self._chosen_fold)
             else:
-                predictions = model.predict(X_stacked)
+                predictions = model.predict(X_stacked, self._chosen_fold)
         else:
-            predictions = model.predict(X)
+            predictions = model.predict(X, self._chosen_fold)

         if self._ml_task == BINARY_CLASSIFICATION:
             # need to predict the label based on predictions and threshold
@@ -1790,6 +1789,11 @@ def _get_random_state(self):
"""Gets the current random_state"""
self._validate_random_state()
return deepcopy(self.random_state)

def _get_chosen_fold(self):
"""Gets the current chosen_fold"""
self._validate_chosen_fold()
return deepcopy(self.chosen_fold)

def _validate_mode(self):
"""Validates mode parameter"""
@@ -2031,6 +2035,12 @@ def _validate_n_jobs(self):
     def _validate_random_state(self):
         """Validates random_state parameter"""
         check_positive_integer(self.random_state, "random_state")
+
+    def _validate_chosen_fold(self):
+        """Validates chosen_fold parameter"""
+        if self.chosen_fold is None:
+            return
+        check_integer(self.chosen_fold, "chosen_fold")

     def to_json(self):
         if self._best_model is None:
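
A note on the validation added above: `check_integer` only enforces the type, so any integer is accepted, including negative values; nothing bounds `chosen_fold` by the number of folds. A minimal sketch of the effective rule (an assumption mirroring the diff, not code from the PR):

```python
# Assumption: mirrors _validate_chosen_fold above. Any int (or None) passes;
# an out-of-range index would only surface later, at prediction time, when
# ModelFramework.predict evaluates self.learners[chosen_fold].
def validate_chosen_fold(chosen_fold):
    if chosen_fold is None:
        return
    if not isinstance(chosen_fold, int):
        raise ValueError("chosen_fold needs to be an integer or None")
```
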
8 changes: 5 additions & 3 deletions supervised/ensemble.py
@@ -290,7 +290,7 @@ def fit(self, oofs, y, sample_weight=None):

         self.train_time = time.time() - start_time

-    def predict(self, X, X_stacked=None):
+    def predict(self, X, X_stacked=None, chosen_fold=None):
         logger.debug(
             "Ensemble.predict with {} models".format(len(self.selected_models))
         )
@@ -303,9 +303,11 @@ def predict(self, X, X_stacked=None):
             total_repeat += repeat

             if model._is_stacked:
-                y_predicted_from_model = model.predict(X_stacked)
+                y_predicted_from_model = model.predict(
+                    X_stacked, chosen_fold=chosen_fold
+                )
             else:
-                y_predicted_from_model = model.predict(X)
+                y_predicted_from_model = model.predict(X, chosen_fold=chosen_fold)

         prediction_cols = []
         if self._ml_task in [BINARY_CLASSIFICATION, MULTICLASS_CLASSIFICATION]:
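
The ensemble change is a pure pass-through: `Ensemble.predict` does not interpret `chosen_fold` itself, it forwards the same value to every selected member model. A simplified sketch of the dispatch pattern (ignoring the repeat weighting and stacking in the real code):

```python
# Simplified sketch (assumption): each member model resolves chosen_fold on
# its own; the ensemble just averages whatever the members return.
def ensemble_predict(models, X, chosen_fold=None):
    member_preds = [m.predict(X, chosen_fold=chosen_fold) for m in models]
    return sum(member_preds) / len(member_preds)
```
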
34 changes: 24 additions & 10 deletions supervised/model_framework.py
@@ -416,26 +416,40 @@ def is_fast_enough(self, max_single_prediction_time):

         return self._single_prediction_time < max_single_prediction_time

-    def predict(self, X):
+    def predict(self, X, chosen_fold=None):
         logger.debug("ModelFramework.predict")

         if self.learners is None or len(self.learners) == 0:
             raise Exception("Learnes are not initialized")
         # run predict on all learners and return the average
         y_predicted = None  # np.zeros((X.shape[0],))
-        for ind, learner in enumerate(self.learners):
-            # preprocessing goes here
-            X_data, _, _ = self.preprocessings[ind].transform(X.copy(), None)
-            y_p = learner.predict(X_data)
-            y_p = self.preprocessings[ind].inverse_scale_target(y_p)
-
-            y_predicted = y_p if y_predicted is None else y_predicted + y_p
-
-        y_predicted_average = y_predicted / float(len(self.learners))
-
-        y_predicted_final = self.preprocessings[0].prepare_target_labels(
-            y_predicted_average
-        )
+
+        # If no specific fold is chosen, return the average prediction across all folds
+        if chosen_fold is None:
+            for ind, learner in enumerate(self.learners):
+                # preprocessing goes here
+                X_data, _, _ = self.preprocessings[ind].transform(X.copy(), None)
+                y_p = learner.predict(X_data)
+                y_p = self.preprocessings[ind].inverse_scale_target(y_p)
+
+                y_predicted = y_p if y_predicted is None else y_predicted + y_p
+
+            y_predicted_average = y_predicted / float(len(self.learners))
+
+            y_predicted_final = self.preprocessings[0].prepare_target_labels(
+                y_predicted_average
+            )
+        else:
+            ind = chosen_fold
+            learner = self.learners[ind]
+            # preprocessing goes here
+            X_data, _, _ = self.preprocessings[ind].transform(X.copy(), None)
+            y_p = learner.predict(X_data)
+            y_predicted = self.preprocessings[ind].inverse_scale_target(y_p)
+
+            y_predicted_final = self.preprocessings[0].prepare_target_labels(
+                y_predicted
+            )

         return y_predicted_final

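
Because the new `else` branch indexes `self.learners` directly, ordinary Python list indexing applies: a negative `chosen_fold` counts from the end of the learner list. That is what lets the test below pass `chosen_fold=-1` to select the last fold's learner:

```python
# Minimal illustration (not from the PR): negative indices count from the
# end, so chosen_fold=-1 means "the learner trained on the last fold".
learners = ["fold_0", "fold_1", "fold_2", "fold_3", "fold_4"]
print(learners[-1])  # -> fold_4
```
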
52 changes: 52 additions & 0 deletions tests/tests_automl/test_chosen_fold.py
@@ -0,0 +1,52 @@
+import os
+import unittest
+import tempfile
+import json
+import numpy as np
+import pandas as pd
+import shutil
+
+from supervised import AutoML
+from supervised.exceptions import AutoMLException
+
+
+class AutoMLChosenFoldTest(unittest.TestCase):
+
+    automl_dir = "automl_testing"
+
+    def tearDown(self):
+        shutil.rmtree(self.automl_dir, ignore_errors=True)
+
+    def test_chosen_fold(self):
+
+        X = np.random.uniform(size=(60, 2))
+        y = np.random.randint(0, 2, size=(60,))
+
+        automl = AutoML(
+            results_path=self.automl_dir,
+            model_time_limit=10,
+            algorithms=["Xgboost"],
+            mode="Compete",
+            explain_level=0,
+            validation_strategy={
+                "validation_type": "kfold",
+                "k_folds": 5,
+                "shuffle": True,
+                "random_seed": 123,
+            },
+            start_random_models=1,
+            hill_climbing_steps=0,
+            top_models_to_improve=0,
+            kmeans_features=False,
+            golden_features=False,
+            features_selection=False,
+            boost_on_errors=False,
+            chosen_fold=-1,
+        )
+        automl.fit(X, y)
+        automl.predict_proba(X)
+        automl.predict(X)
+
+        self.assertFalse(
+            os.path.exists(os.path.join(self.automl_dir, "1_DecisionTree"))
+        )