From a82a45f7fc480a7717fc707c331cb1a8fd2b67bc Mon Sep 17 00:00:00 2001
From: Caparrini
Date: Thu, 7 Mar 2024 18:44:40 +0100
Subject: [PATCH] Added: Optional use of mlflow, recording only params and
 fitness for now

---
 .github/workflows/CI.yml                 |  2 ++
 mloptimizer/aux/tracker.py               | 26 +++++++++++++++++--
 mloptimizer/genoptimizer/base.py         | 20 +++++++++-----
 .../test/test_genoptimizer/test_meta.py  | 10 +++----
 4 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/.github/workflows/CI.yml b/.github/workflows/CI.yml
index 52746ff..616fe27 100644
--- a/.github/workflows/CI.yml
+++ b/.github/workflows/CI.yml
@@ -14,6 +14,8 @@ jobs:
         run: pip install -r requirements.txt
       - name: Install pytest and pytest-cov
         run: pip install pytest pytest-cov
+      - name: Install mlflow
+        run: pip install mlflow
       - name: Run tests and collect coverage
         run: pytest --cov mloptimizer
       - name: Upload coverage to Codecov
diff --git a/mloptimizer/aux/tracker.py b/mloptimizer/aux/tracker.py
index d73f41c..cf9c298 100644
--- a/mloptimizer/aux/tracker.py
+++ b/mloptimizer/aux/tracker.py
@@ -2,6 +2,7 @@
 import os
 import shutil
 from datetime import datetime
+import importlib


 class Tracker:
@@ -17,9 +18,10 @@ class Tracker:
     log_file : str
         Name of the log file.
     """
-    def __init__(self, name, folder=os.curdir, log_file="mloptimizer.log"):
+
+    def __init__(self, name, folder=os.curdir, log_file="mloptimizer.log", use_mlflow=False):
         self.name = name
-        self.bugs = []
+        self.gen = 0
         # Main folder, current by default
         self.folder = create_optimization_folder(folder)
         # Log files
@@ -34,6 +36,12 @@ def __init__(self, name, folder=os.curdir, log_file="mloptimizer.log"):
         self.results_path = None
         self.graphics_path = None

+        # MLFlow
+        self.use_mlflow = use_mlflow
+
+        if self.use_mlflow:
+            self.mlflow = importlib.import_module("mlflow")
+
     def start_optimization(self, opt_class):
         """
         Start the optimization process.
@@ -63,6 +71,9 @@ def start_checkpoint(self, opt_run_folder_name):
             datetime.now().strftime("%Y%m%d_%H%M%S"),
             type(self).__name__)

+        if self.use_mlflow:
+            self.mlflow.set_experiment(opt_run_folder_name)
+
         self.opt_run_folder = os.path.join(self.folder, opt_run_folder_name)
         self.opt_run_checkpoint_path = os.path.join(self.opt_run_folder, "checkpoints")
         self.results_path = os.path.join(self.opt_run_folder, "results")
@@ -81,8 +92,19 @@ def start_checkpoint(self, opt_run_folder_name):
         )

     def log_clfs(self, classifiers_list: list, generation: int, fitness_list: list[int]):
+        self.gen = generation
         for i in range(len(classifiers_list)):
             self.optimization_logger.info(f"Generation {generation} - Classifier TOP {i}")
             self.optimization_logger.info(f"Classifier: {classifiers_list[i]}")
             self.optimization_logger.info(f"Fitness: {fitness_list[i]}")
             self.optimization_logger.info("Hyperparams: {}".format(str(classifiers_list[i].get_params())))
+        self.gen = generation + 1
+
+    def log_evaluation(self, classifier, metric):
+        self.optimization_logger.info(f"Adding to mlflow...\nClassifier: {classifier}\nFitness: {metric}")
+
+        if self.use_mlflow:
+            with self.mlflow.start_run():
+                self.mlflow.log_params(classifier.get_params())
+                # We use the generation as the step
+                self.mlflow.log_metric(key="fitness", value=metric, step=self.gen)
diff --git a/mloptimizer/genoptimizer/base.py b/mloptimizer/genoptimizer/base.py
index 06522b4..027f72c 100644
--- a/mloptimizer/genoptimizer/base.py
+++ b/mloptimizer/genoptimizer/base.py
@@ -44,6 +44,8 @@ class BaseOptimizer(object):
         seed for the random functions
     use_parallel : bool
         flag to use parallel processing
+    use_mlflow : bool
+        flag to use mlflow
     """
     __metaclass__ = ABCMeta

@@ -51,7 +53,7 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f
                  hyperparam_space: HyperparameterSpace = None,
                  eval_function=train_score, score_function=accuracy_score,
                  seed=random.randint(0, 1000000),
-                 use_parallel=False):
+                 use_parallel=False, use_mlflow=False):
         """
         Creates object BaseOptimizer.

@@ -71,6 +73,10 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f
             function to evaluate the model from X, y, clf
         score_function : func, optional (default=balanced_accuracy_score)
             function to score from y, y_pred
+        use_parallel : bool, optional (default=False)
+            flag to use parallel processing
+        use_mlflow : bool, optional (default=False)
+            flag to use mlflow
         seed : int, optional (default=0)
             seed for the random functions (deap, models, and splits on evaluations)
         """
@@ -80,9 +86,6 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f
         # Input search space hyperparameters
         self.hyperparam_space = hyperparam_space

-        # Tracker
-        self.tracker = Tracker(name="mloptimizer", folder=folder, log_file=log_file)
-
         self.eval_function = eval_function
         self.score_function = score_function

@@ -96,6 +99,12 @@ def __init__(self, features: np.array, labels: np.array, folder=os.curdir, log_f
         # Parallel
         self.use_parallel = use_parallel

+        # mlflow
+        self.use_mlflow = use_mlflow
+
+        # Tracker
+        self.tracker = Tracker(name="mloptimizer", folder=folder, log_file=log_file, use_mlflow=self.use_mlflow)
+
     def set_mlopt_seed(self, seed):
         """
         Method to set the seed for the random functions
@@ -207,7 +216,7 @@ def evaluate_clf(self, individual):
         """
         mean = self.eval_function(self.features, self.labels, self.get_clf(individual),
                                   score_function=self.score_function)
-        # TODO: Log parameters and metrics to MLFlow if it is active
+        self.tracker.log_evaluation(self.get_clf(individual), mean)
         return (mean,)

     def population_2_df(self):
@@ -309,7 +318,6 @@ def optimize_clf(self, population: int = 10, generations: int = 3,
         stats.register("max", np.max)

         start_gen = 0
-        # self.file_out.write("Optimizing accuracy:\n")
         # Using deap, custom for decision tree
         creator.create("FitnessMax", base.Fitness, weights=(1.0,))
         creator.create("Individual", list, fitness=creator.FitnessMax)
diff --git a/mloptimizer/test/test_genoptimizer/test_meta.py b/mloptimizer/test/test_genoptimizer/test_meta.py
index 9ac830c..4b8de32 100644
--- a/mloptimizer/test/test_genoptimizer/test_meta.py
+++ b/mloptimizer/test/test_genoptimizer/test_meta.py
@@ -33,12 +33,12 @@ def test_sklearn_optimizer(clf_class):
     assert mlopt is not None


-def test_mloptimizer():
+@pytest.mark.parametrize('use_mlflow', [True, False])
+def test_mloptimizer(use_mlflow):
     X, y = load_iris(return_X_y=True)
-    mlopt = SklearnOptimizer(clf_class=DecisionTreeClassifier,
-                             hyperparam_space=HyperparameterSpace(fixed_hyperparams={},
-                                                                  evolvable_hyperparams=custom_evolvable_hyperparams),
-                             features=X, labels=y)
+    mlopt = SklearnOptimizer(clf_class=XGBClassifier,
+                             hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier),
+                             features=X, labels=y, use_mlflow=use_mlflow)
     mlopt.optimize_clf(5, 5)
     assert mlopt is not None

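
Usage note (not part of the patch): a minimal sketch of how the new use_mlflow
flag is expected to be exercised, mirroring the updated test above. The module
paths for SklearnOptimizer and HyperparameterSpace are assumptions and may
differ from the actual package layout; mlflow and xgboost must be installed.

    from sklearn.datasets import load_iris
    from xgboost import XGBClassifier
    # Assumed module paths; adjust to the real mloptimizer package layout.
    from mloptimizer.genoptimizer import SklearnOptimizer
    from mloptimizer.hyperparams import HyperparameterSpace

    X, y = load_iris(return_X_y=True)
    mlopt = SklearnOptimizer(
        clf_class=XGBClassifier,
        hyperparam_space=HyperparameterSpace.get_default_hyperparameter_space(XGBClassifier),
        features=X, labels=y,
        use_mlflow=True,  # new flag added by this patch
    )
    mlopt.optimize_clf(5, 5)  # population=5, generations=5

With use_mlflow=True, start_checkpoint() sets the MLflow experiment name to the
run folder name, and each call to evaluate_clf() logs the classifier's params
plus a "fitness" metric (step = current generation) as a separate MLflow run.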