WIP: add benchmark setup for Ridge

soda-inria · Jan 10, 2024 · d0720da · d0720da
1 parent 99021ee
commit d0720da
Show file tree

Hide file tree

Showing 6 changed files with 56 additions and 43 deletions.
diff --git a/.github/workflows/run_benchmark_results_file_sanity_checks.yaml b/.github/workflows/run_benchmark_results_file_sanity_checks.yaml
@@ -19,3 +19,4 @@ jobs:
       run: |
         python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv --check-csv
         python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv --check-csv
+        python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv --check-csv
diff --git a/.github/workflows/sync_benchmark_files_to_gsheet.yaml b/.github/workflows/sync_benchmark_files_to_gsheet.yaml
@@ -24,8 +24,11 @@ jobs:
       run: |
         python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv --check-csv
         python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv --check-csv
+        python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv --check-csv
         echo "$GSPREAD_SERVICE_ACCOUNT_AUTH_KEY" > service_account.json
         python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv \
           --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
         python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv \
           --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
+        python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv \
+          --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
diff --git a/.github/workflows/test_cpu_benchmarks.yaml b/.github/workflows/test_cpu_benchmarks.yaml
@@ -143,3 +143,5 @@ jobs:
         PYTHONPATH=$PYTHONPATH:$(realpath ../../kmeans_dpcpp/) benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=1000,n_features=14]
         cd ../pca
         benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=100,n_features=100]
+        cd ../ridge
+        benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=100,n_features=100]  # TODO add relevant parameters
diff --git a/README.md b/README.md
@@ -18,6 +18,7 @@ hardware.
 Benchmarks are currently available for the following algorithms:
 - [k-means](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/kmeans)
 - [PCA](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/pca)
+- [Ridge](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/ridge)
 
 Here is a (non-exhaustive) list of libraries that are compared in the benchmarks:
 - [scikit-learn](https://scikit-learn.org/stable/index.html)

diff --git a/benchmarks/kmeans/objective.py b/benchmarks/kmeans/objective.py
@@ -32,24 +32,25 @@ def set_data(self, X, **dataset_parameters):
         self.X = X
         dtype = X.dtype
 
-        if self.init == "random" or self.sample_weight == "random":
-            rng = np.random.default_rng(self.random_state)
-
         if self.sample_weight == "None":
             sample_weight = None
         elif self.sample_weight == "unary":
             sample_weight = np.ones(len(X), dtype=dtype)
         elif self.sample_weight == "random":
-            sample_weight = rng.random(size=len(X)).astype(dtype)
+            rng_sample_weight = np.random.default_rng(
+                dataset_parameters["sample_weight"] + 1
+            )
+            sample_weight = rng_sample_weight.random(size=len(X)).astype(dtype)
         else:
             raise ValueError(
                 "Expected 'sample_weight' parameter to be either equal to 'None', "
                 f"'unary' or 'random', but got {sample_weight}."
             )
 
         if self.init == "random":
+            rng_init = np.random.default_rng(self.random_state)
             init = np.array(
-                rng.choice(X, self.n_clusters, replace=False), dtype=X.dtype
+                rng_init.choice(X, self.n_clusters, replace=False), dtype=X.dtype
             )
         elif self.init == "k-means++":
             init = self.init

diff --git a/benchmarks/pca/consolidate_result_csv.py b/benchmarks/pca/consolidate_result_csv.py
@@ -8,7 +8,7 @@
 import pandas as pd
 from pandas.io.parsers.readers import STR_NA_VALUES
 
-GOOGLE_WORKSHEET_NAME = "PCA"
+GOOGLE_WORKSHEET_NAME = "Ridge"
 
 DATES_FORMAT = "%Y-%m-%d"
 
@@ -17,14 +17,15 @@
     "objective_dataset_param___name",
     "objective_dataset_param_n_samples",
     "objective_dataset_param_n_features",
+    "objective_dataset_param_n_targets",
     "objective_dataset_param_dtype",
     "objective_dataset_param_random_state",
-    "objective_objective_param_n_components",
+    "objective_objective_param_alpha",
+    "objective_objective_param_fit_intercept",
+    "objective_objective_param_max_iter",
     "objective_objective_param_tol",
-    "objective_objective_param_iterated_power",
-    "objective_objective_param_n_oversamples",
+    "objective_objective_param_sample_weight",
     "objective_objective_param_random_state",
-    "objective_objective_param_verbose",
 ]
 
 BENCHMARK_DEFINING_COLUMNS = sorted(BENCHMARK_DEFINING_COLUMNS)
@@ -35,21 +36,22 @@
 COMPUTE_DEVICE = "Compute device"
 COMPUTE_RUNTIME = "Compute runtime"
 DATA_RANDOM_STATE = "Data random state"
+DATA_SAMPLE_WEIGHTS = "Data sample weights"
 DTYPE = "Dtype"
-NB_COMPONENTS = "Nb components"
+ALPHA = "alpha"
 NB_DATA_FEATURES = "Nb data features"
 NB_DATA_SAMPLES = "Nb data samples"
-ITERATED_POWER = "Iterated power"
-SVD_SOLVER = "SVD solver"
-POWER_ITERATION_NORMALIZER = "Power iteration normalizer"
+NB_DATA_TARGETS = "Nb data targets"
+SOLVER = "Solver"
 PLATFORM = "Platform"
 PLATFORM_ARCHITECTURE = "Platform architecture"
 PLATFORM_RELEASE = "Platform release"
 SYSTEM_CPUS = "Nb cpus"
 SYSTEM_PROCESSOR = "Cpu name"
 SYSTEM_RAM = "RAM (GB)"
 SYSTEM_GPU = "Gpu name"
-SUM_OF_EXPLAINED_VARIANCE_RATIO = "Sum of explained variance ratio"
+RESULT_NB_ITERATIONS = "Result nb iterations"
+OBJECTIVE_FUNCTION_VALUE = "Result objective value"
 VERSION_INFO = "Version info"
 RUN_DATE = "Run date"
 SOLVER_RANDOM_STATE = "Solver random state"
@@ -62,14 +64,14 @@
     DTYPE,
     NB_DATA_SAMPLES,
     NB_DATA_FEATURES,
-    NB_COMPONENTS,
+    NB_DATA_TARGETS,
+    DATA_SAMPLE_WEIGHTS,
+    ALPHA,
     WALLTIME,
     BACKEND_PROVIDER,
     COMPUTE_DEVICE,
     COMPUTE_RUNTIME,
-    SVD_SOLVER,
-    ITERATED_POWER,
-    POWER_ITERATION_NORMALIZER,
+    SOLVER,
     SYSTEM_CPUS,
     SYSTEM_PROCESSOR,
     SYSTEM_GPU,
@@ -80,7 +82,8 @@
     RUN_DATE,
     VERSION_INFO,
     COMMENT,
-    SUM_OF_EXPLAINED_VARIANCE_RATIO,
+    RESULT_NB_ITERATIONS,
+    OBJECTIVE_FUNCTION_VALUE,
     DATA_RANDOM_STATE,
     SOLVER_RANDOM_STATE,
 ]
@@ -90,15 +93,15 @@
     DTYPE: str,
     NB_DATA_SAMPLES: np.int64,
     NB_DATA_FEATURES: np.int64,
-    NB_COMPONENTS: np.int64,
+    NB_DATA_TARGETS: np.int64,
+    ALPHA: np.float64,
     WALLTIME: np.float64,
     BACKEND_PROVIDER: str,
     COMPUTE_DEVICE: str,
     COMPUTE_RUNTIME: str,
-    SUM_OF_EXPLAINED_VARIANCE_RATIO: np.float64,
-    SVD_SOLVER: str,
-    ITERATED_POWER: str,
-    POWER_ITERATION_NORMALIZER: str,
+    RESULT_NB_ITERATIONS: np.int64,
+    OBJECTIVE_FUNCTION_VALUE: np.float64,
+    SOLVER: str,
     PLATFORM: str,
     PLATFORM_ARCHITECTURE: str,
     PLATFORM_RELEASE: str,
@@ -113,7 +116,7 @@
     COMMENT: str,
 }
 
-COLUMNS_WITH_NONE_STRING = []
+COLUMNS_WITH_NONE_STRING = [DATA_SAMPLE_WEIGHTS]
 
 # If all those fields have equal values for two given benchmarks, then the oldest
 # benchmark (given by RUN_DATE) will be discarded
@@ -122,11 +125,10 @@
     DTYPE,
     NB_DATA_SAMPLES,
     NB_DATA_FEATURES,
-    NB_COMPONENTS,
+    NB_DATA_TARGETS,
+    DATA_SAMPLE_WEIGHTS,
     BACKEND_PROVIDER,
-    SVD_SOLVER,
-    ITERATED_POWER,
-    POWER_ITERATION_NORMALIZER,
+    SOLVER,
     COMPUTE_DEVICE,
     COMPUTE_RUNTIME,
     PLATFORM,
@@ -143,15 +145,16 @@
     (DTYPE, True),
     (NB_DATA_SAMPLES, False),
     (NB_DATA_FEATURES, False),
-    (NB_COMPONENTS, False),
+    (NB_DATA_TARGETS, False),
+    (DATA_SAMPLE_WEIGHTS, True),
+    (ALPHA, False),
     (WALLTIME, True),
     (BACKEND_PROVIDER, True),
     (COMPUTE_DEVICE, True),
     (COMPUTE_RUNTIME, True),
-    (SVD_SOLVER, True),
-    (ITERATED_POWER, True),
-    (POWER_ITERATION_NORMALIZER, True),
-    (SUM_OF_EXPLAINED_VARIANCE_RATIO, False),
+    (RESULT_NB_ITERATIONS, True),
+    (OBJECTIVE_FUNCTION_VALUE, False),
+    (SOLVER, True),
     (SYSTEM_GPU, True),
     (SYSTEM_CPUS, True),
     (PLATFORM, True),
@@ -170,19 +173,21 @@
 
 PARQUET_TABLE_DISPLAY_MAPPING = dict(
     time=WALLTIME,
-    objective_value=SUM_OF_EXPLAINED_VARIANCE_RATIO,
+    objective_value=OBJECTIVE_FUNCTION_VALUE,
+    objective_n_iter=RESULT_NB_ITERATIONS,
     objective_dataset_param_n_samples=NB_DATA_SAMPLES,
     objective_dataset_param_n_features=NB_DATA_FEATURES,
+    objective_dataset_param_n_targets=NB_DATA_TARGETS,
     objective_dataset_param_dtype=DTYPE,
     objective_dataset_param_random_state=DATA_RANDOM_STATE,
-    objective_objective_param_n_components=NB_COMPONENTS,
+    objective_objective_param_alpha=ALPHA,
+    objective_objective_param_fit_intercept=ALPHA,
+    objective_objective_param_sample_weight=DATA_SAMPLE_WEIGHTS,
     objective_objective_param_random_state=SOLVER_RANDOM_STATE,
     objective_solver_param___name=BACKEND_PROVIDER,
     objective_solver_param_device=COMPUTE_DEVICE,
     objective_solver_param_runtime=COMPUTE_RUNTIME,
-    objective_solver_param_power_iteration_normalizer=POWER_ITERATION_NORMALIZER,
-    objective_solver_param_iterated_power=ITERATED_POWER,
-    objective_solver_param_svd_solver=SVD_SOLVER,
+    objective_solver_param_solver=SOLVER,
     objective_solver_param_comment=COMMENT,
     objective_solver_param_version_info=VERSION_INFO,
     objective_solver_param_run_date=RUN_DATE,
@@ -486,7 +491,7 @@ def _gspread_sync(source, gspread_url, gspread_auth_key):
 
     argparser = ArgumentParser(
         description=(
-            "Print an aggregated CSV-formated database of pca benchmark results "
+            "Print an aggregated CSV-formated database of ridge benchmark results "
             "for the sklearn-engine-benchmarks project hosted at "
             "https://github.com/soda-inria/sklearn-engine-benchmarks.\n\n"
             "The inputs are assumed to be a collection of benchopt parquet files and "
@@ -509,15 +514,15 @@ def _gspread_sync(source, gspread_url, gspread_auth_key):
     argparser.add_argument(
         "--check-csv",
         action="store_true",
-        help="Perform a few sanity checks on a CSV database of pca benchmark "
+        help="Perform a few sanity checks on a CSV database of ridge benchmark "
         "results. If this option is passed, then the command only expects a single "
         "input path to a csv file.",
     )
 
     argparser.add_argument(
         "--sync-to-gspread",
         action="store_true",
-        help="Synchronize a CSV database of pca benchmark results to a google "
+        help="Synchronize a CSV database of ridge benchmark results to a google "
         "spreadsheet and format it nicely. If this option is passed, then the command "
         "only expects a single input path to a csv file, and also requires "
         "--gspread-url and --gspread-auth-key.",