diff --git a/lkauto/explicit/explicit_evaler.py b/lkauto/explicit/explicit_evaler.py
index 428d874..8ea21b8 100644
--- a/lkauto/explicit/explicit_evaler.py
+++ b/lkauto/explicit/explicit_evaler.py
@@ -24,6 +24,8 @@ class ExplicitEvaler:
         LensKit prediction accuracy metric used to evaluate the model (either rmse or mae)
     filer : Filer
         filer to organize the output.
+    validation : pd.DataFrame
+        pandas dataset containing the validation split.
     random_state :
         The random number generator or seed (see :py:func:`lenskit.util.rng`).
     split_folds :
@@ -48,6 +50,7 @@ def __init__(self,
                  train: pd.DataFrame,
                  optimization_metric,
                  filer: Filer,
+                 validation=None,
                  random_state=42,
                  split_folds: int = 1,
                  split_strategie: str = 'user_based',
@@ -58,6 +61,7 @@ def __init__(self,
         self.logger = logging.getLogger('lenskit-auto')
         self.train = train
         self.filer = filer
+        self.validation = validation
         self.random_state = random_state
         self.split_folds = split_folds
         self.optimization_metric = optimization_metric
@@ -67,11 +71,14 @@ def __init__(self,
         self.run_id = 0
         self.ensemble_size = ensemble_size
         self.top_n_runs = pd.DataFrame(columns=['run_id', 'model', 'error'])
-        self.val_fold_indices = validation_split(data=self.train,
-                                                 strategie=self.split_strategie,
-                                                 num_folds=self.split_folds,
-                                                 frac=self.split_frac,
-                                                 random_state=self.random_state)
+        if self.validation is None:
+            self.val_fold_indices = validation_split(data=self.train,
+                                                     strategie=self.split_strategie,
+                                                     num_folds=self.split_folds,
+                                                     frac=self.split_frac,
+                                                     random_state=self.random_state)
+        else:
+            self.val_fold_indices = None

     def evaluate(self, config_space: ConfigurationSpace) -> float:
         """ evaluates model defined in config_space
@@ -98,10 +105,14 @@ def evaluate(self, config_space: ConfigurationSpace) -> float:
         model = get_model_from_cs(config_space, feedback='explicit')

         # loop over validation folds
-        for fold in range(len(self.val_fold_indices)):
-            # get validation split by fold index
-            validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
-            validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
+        for fold in range(self.split_folds):
+            if self.validation is None:
+                # get validation split by fold index
+                validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
+                validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
+            else:
+                validation_train = self.train
+                validation_test = self.validation

             # split validation data into X and y
             X_validation_test = validation_test.copy()
diff --git a/lkauto/implicit/implicit_evaler.py b/lkauto/implicit/implicit_evaler.py
index bd3090a..ee52f87 100644
--- a/lkauto/implicit/implicit_evaler.py
+++ b/lkauto/implicit/implicit_evaler.py
@@ -23,6 +23,8 @@ class ImplicitEvaler:
         LensKit top-n metric used to evaluate the model
     filer : Filer
         filer to organize the output.
+    validation : pd.DataFrame
+        pandas dataset containing the validation split.
     random_state :
         The random number generator or seed (see :py:func:`lenskit.util.rng`).
     split_folds :
@@ -47,6 +49,7 @@ def __init__(self,
                  train: pd.DataFrame,
                  optimization_metric,
                  filer: Filer,
+                 validation=None,
                  random_state=42,
                  split_folds: int = 1,
                  split_strategie: str = 'user_based',
@@ -56,6 +59,7 @@ def __init__(self,
                  ) -> None:
         self.logger = logging.getLogger('lenskit-auto')
         self.train = train
+        self.validation = validation
         self.optimization_metric = optimization_metric
         self.random_state = random_state
         self.split_folds = split_folds
@@ -66,11 +70,14 @@ def __init__(self,
         self.minimize_error_metric_val = minimize_error_metric_val
         self.run_id = 0
         # create validation split
-        self.val_fold_indices = validation_split(data=self.train,
-                                                 strategie=self.split_strategie,
-                                                 num_folds=self.split_folds,
-                                                 frac=self.split_frac,
-                                                 random_state=self.random_state)
+        if self.validation is None:
+            self.val_fold_indices = validation_split(data=self.train,
+                                                     strategie=self.split_strategie,
+                                                     num_folds=self.split_folds,
+                                                     frac=self.split_frac,
+                                                     random_state=self.random_state)
+        else:
+            self.val_fold_indices = None

     def evaluate(self, config_space: ConfigurationSpace) -> float:
         """ evaluates model defined in config_space
@@ -98,10 +105,14 @@ def evaluate(self, config_space: ConfigurationSpace) -> float:
         model = get_model_from_cs(config_space, feedback='implicit')

         # iterate over validation folds
-        for fold in range(len(self.val_fold_indices)):
+        for fold in range(self.split_folds):
             # get validation split by index
-            validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
-            validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
+            if self.validation is None:
+                validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
+                validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
+            else:
+                validation_train = self.train
+                validation_test = self.validation

             # fit and recommend from configuration
             model = model.fit(validation_train)
diff --git a/lkauto/lkauto.py b/lkauto/lkauto.py
index c4790d2..4310bd4 100644
--- a/lkauto/lkauto.py
+++ b/lkauto/lkauto.py
@@ -20,6 +20,7 @@


 def get_best_prediction_model(train: pd.DataFrame,
+                              validation: pd.DataFrame = None,
                               cs: ConfigurationSpace = None,
                               optimization_metric=rmse,
                               optimization_strategie: str = 'bayesian',
@@ -60,6 +61,9 @@ def get_best_prediction_model(train: pd.DataFrame,
     ----------
     train : pd.DataFrame
         Pandas Dataframe train split.
+    validation : pd.DataFrame
+        Pandas Dataframe validation split.
+        If a validation split is provided, split_folds, split_strategie and split_frac will be ignored.
     cs : ConfigurationSpace
         ConfigurationSpace with all algorithms and parameter ranges defined.
     optimization_metric : function
@@ -142,6 +146,10 @@ def get_best_prediction_model(train: pd.DataFrame,
         logger.debug('initializing random_state')
         random_state = 42

+    # set split_folds to 1 if validation is not None
+    if validation is not None:
+        split_folds = 1
+
     # preprocess data
     preprocess_data(data=train,
                     user_col=user_column,
@@ -159,6 +167,7 @@ def get_best_prediction_model(train: pd.DataFrame,
         incumbent, top_n_runs = bayesian_optimization(train=train,
                                                       cs=cs,
                                                       user_feedback='explicit',
+                                                      validation=validation,
                                                       optimization_metric=optimization_metric,
                                                       time_limit_in_sec=time_limit_in_sec,
                                                       num_evaluations=num_evaluations,
@@ -173,6 +182,7 @@ def get_best_prediction_model(train: pd.DataFrame,
         incumbent, top_n_runs = random_search(train=train,
                                               cs=cs,
                                               user_feedback='explicit',
+                                              validation=validation,
                                               optimization_metric=optimization_metric,
                                               time_limit_in_sec=time_limit_in_sec,
                                               num_evaluations=num_evaluations,
@@ -212,6 +222,7 @@ def get_best_prediction_model(train: pd.DataFrame,


 def get_best_recommender_model(train: pd.DataFrame,
+                               validation: pd.DataFrame = None,
                                cs: ConfigurationSpace = None,
                                optimization_metric=ndcg,
                                optimization_strategie: str = 'bayesian',
@@ -252,6 +263,9 @@
     ----------
     train : pd.DataFrame
         Pandas Dataframe train split.
+    validation : pd.DataFrame
+        Pandas Dataframe validation split.
+        If a validation split is provided, split_folds, split_strategie and split_frac will be ignored.
     cs : ConfigurationSpace
         ConfigurationSpace with all algorithms and parameter ranges defined.
     optimization_strategie: str
@@ -333,6 +347,10 @@
         logger.debug('random_state is None. Initializing random_state.')
         random_state = 42

+    # set split_folds to 1 if validation is not None
+    if validation is not None:
+        split_folds = 1
+
     # preprocess data
     preprocess_data(data=train,
                     user_col=user_column,
@@ -348,6 +366,7 @@
     # define optimization strategie to use
     if optimization_strategie == 'bayesian':
         incumbent = bayesian_optimization(train=train,
+                                          validation=validation,
                                           cs=cs,
                                           user_feedback='implicit',
                                           optimization_metric=optimization_metric,
@@ -362,6 +381,7 @@
                                           filer=filer)
     elif optimization_strategie == 'random_search':
         incumbent = random_search(train=train,
+                                  validation=validation,
                                   cs=cs,
                                   user_feedback='implicit',
                                   optimization_metric=optimization_metric,
diff --git a/lkauto/optimization_strategies/bayesian_optimization.py b/lkauto/optimization_strategies/bayesian_optimization.py
index 032f0de..c4a82e4 100644
--- a/lkauto/optimization_strategies/bayesian_optimization.py
+++ b/lkauto/optimization_strategies/bayesian_optimization.py
@@ -15,6 +15,7 @@

 def bayesian_optimization(train: pd.DataFrame,
                           user_feedback: str,
+                          validation: pd.DataFrame = None,
                           cs: ConfigurationSpace = None,
                           optimization_metric=None,
                           time_limit_in_sec: int = 2700,
@@ -38,6 +39,8 @@
     ----------
     train : pd.DataFrame
         Pandas Dataframe outer train split.
+    validation : pd.DataFrame
+        Pandas Dataframe validation split.
     cs : ConfigurationSpace
         ConfigurationSpace with all algorithms and hyperparameter ranges defined.
     time_limit_in_sec : int
@@ -82,6 +85,7 @@ def bayesian_optimization(train: pd.DataFrame,
     # initialize Evaler for SMAC evaluations
     if user_feedback == 'explicit':
         evaler = ExplicitEvaler(train=train,
+                                validation=validation,
                                 optimization_metric=optimization_metric,
                                 filer=filer,
                                 random_state=random_state,
@@ -92,6 +96,7 @@ def bayesian_optimization(train: pd.DataFrame,
                                 minimize_error_metric_val=minimize_error_metric_val)
     elif user_feedback == 'implicit':
         evaler = ImplicitEvaler(train=train,
+                                validation=validation,
                                 optimization_metric=optimization_metric,
                                 filer=filer,
                                 random_state=random_state,
@@ -108,6 +113,7 @@ def bayesian_optimization(train: pd.DataFrame,
         logger.debug('initializing default ConfigurationSpace')
         cs = get_default_configuration_space(data=train,
                                              val_fold_indices=evaler.val_fold_indices,
+                                             validation=validation,
                                              feedback='explicit',
                                              random_state=random_state)

diff --git a/lkauto/optimization_strategies/random_search.py b/lkauto/optimization_strategies/random_search.py
index 655791d..5571b99 100644
--- a/lkauto/optimization_strategies/random_search.py
+++ b/lkauto/optimization_strategies/random_search.py
@@ -19,6 +19,7 @@ def random_search(cs: ConfigurationSpace,
                   user_feedback: str,
                   optimization_metric,
                   filer: Filer,
+                  validation: pd.DataFrame = None,
                   time_limit_in_sec: int = 3600,
                   num_evaluations: int = None,
                   split_folds: int = 1,
@@ -52,6 +53,8 @@
     random_state: int
     filer : Filer
         filer to manage LensKit-Auto output
+    validation : pd.DataFrame
+        Pandas Dataframe validation split.
     time_limit_in_sec
         time limit in seconds for the optimization process
     split_folds : int
@@ -89,6 +92,7 @@
         evaler = ExplicitEvaler(train=train,
                                 optimization_metric=optimization_metric,
                                 filer=filer,
+                                validation=validation,
                                 random_state=random_state,
                                 split_folds=split_folds,
                                 split_strategie=split_strategie,
@@ -99,6 +103,7 @@
         evaler = ImplicitEvaler(train=train,
                                 optimization_metric=optimization_metric,
                                 filer=filer,
+                                validation=validation,
                                 random_state=random_state,
                                 split_folds=split_folds,
                                 split_strategie=split_strategie,
@@ -113,6 +118,7 @@
         logger.debug('initializing default ConfigurationSpace')
         cs = get_default_configuration_space(data=train,
                                              val_fold_indices=evaler.val_fold_indices,
+                                             validation=validation,
                                              feedback='explicit',
                                              random_state=random_state)

diff --git a/lkauto/utils/get_default_configuration_space.py b/lkauto/utils/get_default_configuration_space.py
index 97b042b..69b9364 100644
--- a/lkauto/utils/get_default_configuration_space.py
+++ b/lkauto/utils/get_default_configuration_space.py
@@ -14,6 +14,7 @@
 def get_default_configuration_space(data: pd.DataFrame,
                                     val_fold_indices,
                                     feedback: str,
+                                    validation: pd.DataFrame = None,
                                     random_state=42) -> ConfigurationSpace:
     """ returns the default configuration space for all included rating predictions algorithms

@@ -24,6 +25,8 @@
         data to use
     val_fold_indices
         validation fold indices
+    validation: pd.DataFrame
+        validation data (provided by user)
     feedback : str
         feedback type, either 'explicit' or 'implicit'
     random_state: int
@@ -38,15 +41,22 @@
         raise ValueError("Unknown feedback type: {}".format(feedback))

     # get minimum number of items and users for the given train split
-    val_fold_indices = val_fold_indices
+
     num_items = 0
     num_users = 0
-    for fold in range(len(val_fold_indices)):
-        tmp = data.loc[val_fold_indices[fold]["train"], :]
-        if tmp['item'].nunique() < num_items or num_items == 0:
-            num_items = tmp['item'].nunique()
-        if tmp['user'].nunique() < num_users or num_users == 0:
-            num_users = tmp['user'].nunique()
+    if validation is None:
+        val_fold_indices = val_fold_indices
+        for fold in range(len(val_fold_indices)):
+            tmp = data.loc[val_fold_indices[fold]["train"], :]
+            if tmp['item'].nunique() < num_items or num_items == 0:
+                num_items = tmp['item'].nunique()
+            if tmp['user'].nunique() < num_users or num_users == 0:
+                num_users = tmp['user'].nunique()
+    else:
+        if data['item'].nunique() < num_items or num_items == 0:
+            num_items = data['item'].nunique()
+        if data['user'].nunique() < num_users or num_users == 0:
+            num_users = data['user'].nunique()

     # define configuration space
     cs = ConfigurationSpace(
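For reference, a minimal usage sketch of the `validation` parameter added by this patch. The toy ratings frame, the per-user hold-out rule and the `num_evaluations` value are illustrative assumptions; only `get_best_prediction_model` and its `train`/`validation` parameters are taken from the patch itself.

import pandas as pd
from lkauto.lkauto import get_best_prediction_model

# toy explicit-feedback data (illustrative only)
ratings = pd.DataFrame({
    'user':   [1, 1, 1, 2, 2, 2, 3, 3, 3],
    'item':   [10, 20, 30, 10, 20, 40, 20, 30, 40],
    'rating': [4.0, 3.5, 5.0, 2.0, 4.5, 3.0, 5.0, 2.5, 4.0],
})

# hold out one interaction per user as a user-provided validation split
validation = ratings.groupby('user').tail(1)
train = ratings.drop(validation.index)

# with `validation` given, split_folds / split_strategie / split_frac are ignored:
# the evalers fit on `train` and score on `validation` directly
result = get_best_prediction_model(train=train,
                                   validation=validation,
                                   num_evaluations=5)

Because the callers force split_folds to 1 whenever validation is not None, each configuration is evaluated with exactly one fit/score pass: the evaler fits on the full train frame and evaluates on the provided validation frame instead of calling validation_split().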