
Commit

Merge remote-tracking branch 'origin/main'
tobiasvente committed Oct 9, 2023
2 parents 1e7ae3a + ba2c0be commit a461a36
Showing 6 changed files with 88 additions and 24 deletions.
29 changes: 20 additions & 9 deletions lkauto/explicit/explicit_evaler.py
@@ -24,6 +24,8 @@ class ExplicitEvaler:
LensKit prediction accuracy metric used to evaluate the model (either rmse or mae)
filer : Filer
filer to organize the output.
validation : pd.DataFrame
pandas DataFrame containing the validation split.
random_state :
The random number generator or seed (see :py:func:`lenskit.util.rng`).
split_folds :
@@ -48,6 +50,7 @@ def __init__(self,
train: pd.DataFrame,
optimization_metric,
filer: Filer,
validation=None,
random_state=42,
split_folds: int = 1,
split_strategie: str = 'user_based',
@@ -58,6 +61,7 @@
self.logger = logging.getLogger('lenskit-auto')
self.train = train
self.filer = filer
self.validation = validation
self.random_state = random_state
self.split_folds = split_folds
self.optimization_metric = optimization_metric
@@ -67,11 +71,14 @@ def __init__(self,
self.run_id = 0
self.ensemble_size = ensemble_size
self.top_n_runs = pd.DataFrame(columns=['run_id', 'model', 'error'])
self.val_fold_indices = validation_split(data=self.train,
strategie=self.split_strategie,
num_folds=self.split_folds,
frac=self.split_frac,
random_state=self.random_state)
if self.validation is None:
self.val_fold_indices = validation_split(data=self.train,
strategie=self.split_strategie,
num_folds=self.split_folds,
frac=self.split_frac,
random_state=self.random_state)
else:
self.val_fold_indices = None

def evaluate(self, config_space: ConfigurationSpace) -> float:
""" evaluates model defined in config_space
@@ -98,10 +105,14 @@ def evaluate(self, config_space: ConfigurationSpace) -> float:
model = get_model_from_cs(config_space, feedback='explicit')

# loop over validation folds
for fold in range(len(self.val_fold_indices)):
# get validation split by fold index
validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
for fold in range(self.split_folds):
if self.validation is None:
# get validation split by fold index
validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
else:
validation_train = self.train
validation_test = self.validation

# split validation data into X and y
X_validation_test = validation_test.copy()
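Both ExplicitEvaler (above) and ImplicitEvaler (below) gain the same branch: when the caller hands in a validation frame, the internally generated fold indices are skipped and every evaluation runs once on the fixed train/validation pair. Below is a minimal sketch of that control flow in plain pandas; `iter_validation_folds` is an illustrative helper, not part of the lkauto codebase:

```python
import pandas as pd

def iter_validation_folds(train: pd.DataFrame,
                          validation: pd.DataFrame = None,
                          fold_indices=None,
                          num_folds: int = 1):
    """Yield (validation_train, validation_test) pairs per fold.

    With a user-supplied validation frame the fold indices are ignored and
    the unsplit train/validation frames are returned for the single fold.
    """
    for fold in range(num_folds):
        if validation is None:
            yield (train.loc[fold_indices[fold]["train"], :],
                   train.loc[fold_indices[fold]["validation"], :])
        else:
            yield train, validation
```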
27 changes: 19 additions & 8 deletions lkauto/implicit/implicit_evaler.py
@@ -23,6 +23,8 @@ class ImplicitEvaler:
LensKit top-n metric used to evaluate the model
filer : Filer
filer to organize the output.
validation : pd.DataFrame
pandas DataFrame containing the validation split.
random_state :
The random number generator or seed (see :py:func:`lenskit.util.rng`).
split_folds :
@@ -47,6 +49,7 @@ def __init__(self,
train: pd.DataFrame,
optimization_metric,
filer: Filer,
validation=None,
random_state=42,
split_folds: int = 1,
split_strategie: str = 'user_based',
@@ -56,6 +59,7 @@
) -> None:
self.logger = logging.getLogger('lenskit-auto')
self.train = train
self.validation = validation
self.optimization_metric = optimization_metric
self.random_state = random_state
self.split_folds = split_folds
@@ -66,11 +70,14 @@
self.minimize_error_metric_val = minimize_error_metric_val
self.run_id = 0
# create validation split
self.val_fold_indices = validation_split(data=self.train,
strategie=self.split_strategie,
num_folds=self.split_folds,
frac=self.split_frac,
random_state=self.random_state)
if self.validation is None:
self.val_fold_indices = validation_split(data=self.train,
strategie=self.split_strategie,
num_folds=self.split_folds,
frac=self.split_frac,
random_state=self.random_state)
else:
self.val_fold_indices = None

def evaluate(self, config_space: ConfigurationSpace) -> float:
""" evaluates model defined in config_space
@@ -98,10 +105,14 @@ def evaluate(self, config_space: ConfigurationSpace) -> float:
model = get_model_from_cs(config_space, feedback='implicit')

# iterate over validation folds
for fold in range(len(self.val_fold_indices)):
for fold in range(self.split_folds):
# get validation split by index
validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
if self.validation is None:
validation_train = self.train.loc[self.val_fold_indices[fold]["train"], :]
validation_test = self.train.loc[self.val_fold_indices[fold]["validation"], :]
else:
validation_train = self.train
validation_test = self.validation

# fit and recommend from configuration
model = model.fit(validation_train)
20 changes: 20 additions & 0 deletions lkauto/lkauto.py
@@ -20,6 +20,7 @@


def get_best_prediction_model(train: pd.DataFrame,
validation: pd.DataFrame = None,
cs: ConfigurationSpace = None,
optimization_metric=rmse,
optimization_strategie: str = 'bayesian',
@@ -60,6 +61,9 @@ def get_best_prediction_model(train: pd.DataFrame,
----------
train : pd.DataFrame
Pandas Dataframe train split.
validation : pd.DataFrame
Pandas Dataframe validation split.
If a validation split is provided, split_folds, split_strategie and split_frac will be ignored.
cs : ConfigurationSpace
ConfigurationSpace with all algorithms and parameter ranges defined.
optimization_metric : function
@@ -142,6 +146,10 @@ def get_best_prediction_model(train: pd.DataFrame,
logger.debug('initializing random_state')
random_state = 42

# set split_folds to 1 if validation is not None
if validation is not None:
split_folds = 1

# preprocess data
preprocess_data(data=train,
user_col=user_column,
@@ -159,6 +167,7 @@ def get_best_prediction_model(train: pd.DataFrame,
incumbent, top_n_runs = bayesian_optimization(train=train,
cs=cs,
user_feedback='explicit',
validation=validation,
optimization_metric=optimization_metric,
time_limit_in_sec=time_limit_in_sec,
num_evaluations=num_evaluations,
@@ -173,6 +182,7 @@ def get_best_prediction_model(train: pd.DataFrame,
incumbent, top_n_runs = random_search(train=train,
cs=cs,
user_feedback='explicit',
validation=validation,
optimization_metric=optimization_metric,
time_limit_in_sec=time_limit_in_sec,
num_evaluations=num_evaluations,
@@ -212,6 +222,7 @@


def get_best_recommender_model(train: pd.DataFrame,
validation: pd.DataFrame = None,
cs: ConfigurationSpace = None,
optimization_metric=ndcg,
optimization_strategie: str = 'bayesian',
@@ -252,6 +263,9 @@ def get_best_recommender_model(train: pd.DataFrame,
----------
train : pd.DataFrame
Pandas Dataframe train split.
validation : pd.DataFrame
Pandas Dataframe validation split.
If a validation split is provided, split_folds, split_strategie and split_frac will be ignored.
cs : ConfigurationSpace
ConfigurationSpace with all algorithms and parameter ranges defined.
optimization_strategie: str
@@ -333,6 +347,10 @@ def get_best_recommender_model(train: pd.DataFrame,
logger.debug('random_state is None. Initializing random_state.')
random_state = 42

# set split_folds to 1 if validation is not None
if validation is not None:
split_folds = 1

# preprocess data
preprocess_data(data=train,
user_col=user_column,
@@ -348,6 +366,7 @@ def get_best_recommender_model(train: pd.DataFrame,
# define optimization strategie to use
if optimization_strategie == 'bayesian':
incumbent = bayesian_optimization(train=train,
validation=validation,
cs=cs,
user_feedback='implicit',
optimization_metric=optimization_metric,
@@ -362,6 +381,7 @@ def get_best_recommender_model(train: pd.DataFrame,
filer=filer)
elif optimization_strategie == 'random_search':
incumbent = random_search(train=train,
validation=validation,
cs=cs,
user_feedback='implicit',
optimization_metric=optimization_metric,
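On the user-facing side, `get_best_prediction_model` and `get_best_recommender_model` simply forward the new keyword to the optimization strategies, which hand it on to the evalers. A hedged usage sketch with tiny synthetic ratings; the import path is inferred from the file name above, the column names follow LensKit's user/item/rating convention, and the result is kept opaque because the return shape is not part of this diff:

```python
import pandas as pd
from lkauto.lkauto import get_best_prediction_model

# tiny synthetic explicit-feedback frames; a real dataset is needed in practice
train = pd.DataFrame({
    'user':   [1, 1, 2, 2, 3, 3, 4, 4],
    'item':   [10, 11, 10, 12, 11, 12, 10, 11],
    'rating': [4.0, 3.5, 5.0, 2.0, 3.0, 4.5, 1.0, 4.0],
})
validation = pd.DataFrame({
    'user':   [1, 2, 3, 4],
    'item':   [12, 11, 11, 12],
    'rating': [3.0, 4.0, 2.5, 5.0],
})

# Passing `validation` fixes the evaluation split: split_folds is forced to 1
# and split_strategie/split_frac are ignored.
result = get_best_prediction_model(train=train, validation=validation)
```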
6 changes: 6 additions & 0 deletions lkauto/optimization_strategies/bayesian_optimization.py
@@ -15,6 +15,7 @@

def bayesian_optimization(train: pd.DataFrame,
user_feedback: str,
validation: pd.DataFrame = None,
cs: ConfigurationSpace = None,
optimization_metric=None,
time_limit_in_sec: int = 2700,
@@ -38,6 +39,8 @@ def bayesian_optimization(train: pd.DataFrame,
----------
train : pd.DataFrame
Pandas Dataframe outer train split.
validation : pd.DataFrame
Pandas Dataframe validation split.
cs : ConfigurationSpace
ConfigurationSpace with all algorithms and hyperparameter ranges defined.
time_limit_in_sec : int
@@ -82,6 +85,7 @@
# initialize Evaler for SMAC evaluations
if user_feedback == 'explicit':
evaler = ExplicitEvaler(train=train,
validation=validation,
optimization_metric=optimization_metric,
filer=filer,
random_state=random_state,
@@ -92,6 +96,7 @@
minimize_error_metric_val=minimize_error_metric_val)
elif user_feedback == 'implicit':
evaler = ImplicitEvaler(train=train,
validation=validation,
optimization_metric=optimization_metric,
filer=filer,
random_state=random_state,
@@ -108,6 +113,7 @@
logger.debug('initializing default ConfigurationSpace')
cs = get_default_configuration_space(data=train,
val_fold_indices=evaler.val_fold_indices,
validation=validation,
feedback='explicit',
random_state=random_state)

6 changes: 6 additions & 0 deletions lkauto/optimization_strategies/random_search.py
@@ -19,6 +19,7 @@ def random_search(cs: ConfigurationSpace,
user_feedback: str,
optimization_metric,
filer: Filer,
validation: pd.DataFrame = None,
time_limit_in_sec: int = 3600,
num_evaluations: int = None,
split_folds: int = 1,
@@ -52,6 +53,8 @@ def random_search(cs: ConfigurationSpace,
random_state: int
filer : Filer
filer to manage LensKit-Auto output
validation : pd.DataFrame
Pandas Dataframe validation split.
time_limit_in_sec
time limit in seconds for the optimization process
split_folds : int
@@ -89,6 +92,7 @@
evaler = ExplicitEvaler(train=train,
optimization_metric=optimization_metric,
filer=filer,
validation=validation,
random_state=random_state,
split_folds=split_folds,
split_strategie=split_strategie,
@@ -99,6 +103,7 @@
evaler = ImplicitEvaler(train=train,
optimization_metric=optimization_metric,
filer=filer,
validation=validation,
random_state=random_state,
split_folds=split_folds,
split_strategie=split_strategie,
@@ -113,6 +118,7 @@
logger.debug('initializing default ConfigurationSpace')
cs = get_default_configuration_space(data=train,
val_fold_indices=evaler.val_fold_indices,
validation=validation,
feedback='explicit',
random_state=random_state)

24 changes: 17 additions & 7 deletions lkauto/utils/get_default_configuration_space.py
@@ -14,6 +14,7 @@
def get_default_configuration_space(data: pd.DataFrame,
val_fold_indices,
feedback: str,
validation: pd.DataFrame = None,
random_state=42) -> ConfigurationSpace:
"""
returns the default configuration space for all included rating predictions algorithms
@@ -24,6 +25,8 @@ def get_default_configuration_space(data: pd.DataFrame,
data to use
val_fold_indices
validation fold indices
validation: pd.DataFrame
validation data (provided by user)
feedback : str
feedback type, either 'explicit' or 'implicit'
random_state: int
@@ -38,15 +41,22 @@ def get_default_configuration_space(data: pd.DataFrame,
raise ValueError("Unknown feedback type: {}".format(feedback))

# get minimum number of items and users for the given train split
val_fold_indices = val_fold_indices

num_items = 0
num_users = 0
for fold in range(len(val_fold_indices)):
tmp = data.loc[val_fold_indices[fold]["train"], :]
if tmp['item'].nunique() < num_items or num_items == 0:
num_items = tmp['item'].nunique()
if tmp['user'].nunique() < num_users or num_users == 0:
num_users = tmp['user'].nunique()
if validation is None:
val_fold_indices = val_fold_indices
for fold in range(len(val_fold_indices)):
tmp = data.loc[val_fold_indices[fold]["train"], :]
if tmp['item'].nunique() < num_items or num_items == 0:
num_items = tmp['item'].nunique()
if tmp['user'].nunique() < num_users or num_users == 0:
num_users = tmp['user'].nunique()
else:
if data['item'].nunique() < num_items or num_items == 0:
num_items = data['item'].nunique()
if data['user'].nunique() < num_users or num_users == 0:
num_users = data['user'].nunique()

# define configuration space
cs = ConfigurationSpace(
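The configuration-space helper only needs the smallest user and item counts observed in any training fold, presumably to bound count-dependent hyperparameter ranges. With a user-provided validation split there is a single training frame, so the counts come straight from it. A standalone restatement of that computation (illustrative function name, not an lkauto API):

```python
import pandas as pd

def min_user_item_counts(data: pd.DataFrame,
                         fold_indices=None,
                         validation: pd.DataFrame = None):
    """Return the smallest (num_users, num_items) over all training folds."""
    if validation is not None:
        # a fixed validation split: the whole frame is the training data
        return data['user'].nunique(), data['item'].nunique()
    num_users = min(data.loc[fold_indices[fold]["train"], :]['user'].nunique()
                    for fold in range(len(fold_indices)))
    num_items = min(data.loc[fold_indices[fold]["train"], :]['item'].nunique()
                    for fold in range(len(fold_indices)))
    return num_users, num_items
```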
