Skip to content

Commit

Permalink
WIP: add benchmark setup for Ridge
Browse files Browse the repository at this point in the history
  • Loading branch information
fcharras committed Jan 10, 2024
1 parent 99021ee commit d0720da
Show file tree
Hide file tree
Showing 6 changed files with 56 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,3 +19,4 @@ jobs:
run: |
python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv --check-csv
python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv --check-csv
python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv --check-csv
3 changes: 3 additions & 0 deletions .github/workflows/sync_benchmark_files_to_gsheet.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,11 @@ jobs:
run: |
python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv --check-csv
python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv --check-csv
python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv --check-csv
echo "$GSPREAD_SERVICE_ACCOUNT_AUTH_KEY" > service_account.json
python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv \
--sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv \
--sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv \
--sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
2 changes: 2 additions & 0 deletions .github/workflows/test_cpu_benchmarks.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -143,3 +143,5 @@ jobs:
PYTHONPATH=$PYTHONPATH:$(realpath ../../kmeans_dpcpp/) benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=1000,n_features=14]
cd ../pca
benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=100,n_features=100]
cd ../ridge
benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=100,n_features=100] # TODO add relevant parameters
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ hardware.
Benchmarks are currently available for the following algorithms:
- [k-means](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/kmeans)
- [PCA](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/pca)
- [Ridge](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/ridge)

Here is a (non-exhaustive) list of libraries that are compared in the benchmarks:
- [scikit-learn](https://scikit-learn.org/stable/index.html)
Expand Down
11 changes: 6 additions & 5 deletions benchmarks/kmeans/objective.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,24 +32,25 @@ def set_data(self, X, **dataset_parameters):
self.X = X
dtype = X.dtype

if self.init == "random" or self.sample_weight == "random":
rng = np.random.default_rng(self.random_state)

if self.sample_weight == "None":
sample_weight = None
elif self.sample_weight == "unary":
sample_weight = np.ones(len(X), dtype=dtype)
elif self.sample_weight == "random":
sample_weight = rng.random(size=len(X)).astype(dtype)
rng_sample_weight = np.random.default_rng(
dataset_parameters["sample_weight"] + 1
)
sample_weight = rng_sample_weight.random(size=len(X)).astype(dtype)
else:
raise ValueError(
"Expected 'sample_weight' parameter to be either equal to 'None', "
f"'unary' or 'random', but got {self.sample_weight}."
)

if self.init == "random":
rng_init = np.random.default_rng(self.random_state)
init = np.array(
rng.choice(X, self.n_clusters, replace=False), dtype=X.dtype
rng_init.choice(X, self.n_clusters, replace=False), dtype=X.dtype
)
elif self.init == "k-means++":
init = self.init
Expand Down
81 changes: 43 additions & 38 deletions benchmarks/pca/consolidate_result_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pandas as pd
from pandas.io.parsers.readers import STR_NA_VALUES

GOOGLE_WORKSHEET_NAME = "PCA"
GOOGLE_WORKSHEET_NAME = "Ridge"

DATES_FORMAT = "%Y-%m-%d"

Expand All @@ -17,14 +17,15 @@
"objective_dataset_param___name",
"objective_dataset_param_n_samples",
"objective_dataset_param_n_features",
"objective_dataset_param_n_targets",
"objective_dataset_param_dtype",
"objective_dataset_param_random_state",
"objective_objective_param_n_components",
"objective_objective_param_alpha",
"objective_objective_param_fit_intercept",
"objective_objective_param_max_iter",
"objective_objective_param_tol",
"objective_objective_param_iterated_power",
"objective_objective_param_n_oversamples",
"objective_objective_param_sample_weight",
"objective_objective_param_random_state",
"objective_objective_param_verbose",
]

BENCHMARK_DEFINING_COLUMNS = sorted(BENCHMARK_DEFINING_COLUMNS)
Expand All @@ -35,21 +36,22 @@
COMPUTE_DEVICE = "Compute device"
COMPUTE_RUNTIME = "Compute runtime"
DATA_RANDOM_STATE = "Data random state"
DATA_SAMPLE_WEIGHTS = "Data sample weights"
DTYPE = "Dtype"
NB_COMPONENTS = "Nb components"
ALPHA = "alpha"
NB_DATA_FEATURES = "Nb data features"
NB_DATA_SAMPLES = "Nb data samples"
ITERATED_POWER = "Iterated power"
SVD_SOLVER = "SVD solver"
POWER_ITERATION_NORMALIZER = "Power iteration normalizer"
NB_DATA_TARGETS = "Nb data targets"
SOLVER = "Solver"
PLATFORM = "Platform"
PLATFORM_ARCHITECTURE = "Platform architecture"
PLATFORM_RELEASE = "Platform release"
SYSTEM_CPUS = "Nb cpus"
SYSTEM_PROCESSOR = "Cpu name"
SYSTEM_RAM = "RAM (GB)"
SYSTEM_GPU = "Gpu name"
SUM_OF_EXPLAINED_VARIANCE_RATIO = "Sum of explained variance ratio"
RESULT_NB_ITERATIONS = "Result nb iterations"
OBJECTIVE_FUNCTION_VALUE = "Result objective value"
VERSION_INFO = "Version info"
RUN_DATE = "Run date"
SOLVER_RANDOM_STATE = "Solver random state"
Expand All @@ -62,14 +64,14 @@
DTYPE,
NB_DATA_SAMPLES,
NB_DATA_FEATURES,
NB_COMPONENTS,
NB_DATA_TARGETS,
DATA_SAMPLE_WEIGHTS,
ALPHA,
WALLTIME,
BACKEND_PROVIDER,
COMPUTE_DEVICE,
COMPUTE_RUNTIME,
SVD_SOLVER,
ITERATED_POWER,
POWER_ITERATION_NORMALIZER,
SOLVER,
SYSTEM_CPUS,
SYSTEM_PROCESSOR,
SYSTEM_GPU,
Expand All @@ -80,7 +82,8 @@
RUN_DATE,
VERSION_INFO,
COMMENT,
SUM_OF_EXPLAINED_VARIANCE_RATIO,
RESULT_NB_ITERATIONS,
OBJECTIVE_FUNCTION_VALUE,
DATA_RANDOM_STATE,
SOLVER_RANDOM_STATE,
]
Expand All @@ -90,15 +93,15 @@
DTYPE: str,
NB_DATA_SAMPLES: np.int64,
NB_DATA_FEATURES: np.int64,
NB_COMPONENTS: np.int64,
NB_DATA_TARGETS: np.int64,
ALPHA: np.float64,
WALLTIME: np.float64,
BACKEND_PROVIDER: str,
COMPUTE_DEVICE: str,
COMPUTE_RUNTIME: str,
SUM_OF_EXPLAINED_VARIANCE_RATIO: np.float64,
SVD_SOLVER: str,
ITERATED_POWER: str,
POWER_ITERATION_NORMALIZER: str,
RESULT_NB_ITERATIONS: np.int64,
OBJECTIVE_FUNCTION_VALUE: np.float64,
SOLVER: str,
PLATFORM: str,
PLATFORM_ARCHITECTURE: str,
PLATFORM_RELEASE: str,
Expand All @@ -113,7 +116,7 @@
COMMENT: str,
}

COLUMNS_WITH_NONE_STRING = []
COLUMNS_WITH_NONE_STRING = [DATA_SAMPLE_WEIGHTS]

# If all those fields have equal values for two given benchmarks, then the oldest
# benchmark (given by RUN_DATE) will be discarded
Expand All @@ -122,11 +125,10 @@
DTYPE,
NB_DATA_SAMPLES,
NB_DATA_FEATURES,
NB_COMPONENTS,
NB_DATA_TARGETS,
DATA_SAMPLE_WEIGHTS,
BACKEND_PROVIDER,
SVD_SOLVER,
ITERATED_POWER,
POWER_ITERATION_NORMALIZER,
SOLVER,
COMPUTE_DEVICE,
COMPUTE_RUNTIME,
PLATFORM,
Expand All @@ -143,15 +145,16 @@
(DTYPE, True),
(NB_DATA_SAMPLES, False),
(NB_DATA_FEATURES, False),
(NB_COMPONENTS, False),
(NB_DATA_TARGETS, False),
(DATA_SAMPLE_WEIGHTS, True),
(ALPHA, False),
(WALLTIME, True),
(BACKEND_PROVIDER, True),
(COMPUTE_DEVICE, True),
(COMPUTE_RUNTIME, True),
(SVD_SOLVER, True),
(ITERATED_POWER, True),
(POWER_ITERATION_NORMALIZER, True),
(SUM_OF_EXPLAINED_VARIANCE_RATIO, False),
(RESULT_NB_ITERATIONS, True),
(OBJECTIVE_FUNCTION_VALUE, False),
(SOLVER, True),
(SYSTEM_GPU, True),
(SYSTEM_CPUS, True),
(PLATFORM, True),
Expand All @@ -170,19 +173,21 @@

PARQUET_TABLE_DISPLAY_MAPPING = dict(
time=WALLTIME,
objective_value=SUM_OF_EXPLAINED_VARIANCE_RATIO,
objective_value=OBJECTIVE_FUNCTION_VALUE,
objective_n_iter=RESULT_NB_ITERATIONS,
objective_dataset_param_n_samples=NB_DATA_SAMPLES,
objective_dataset_param_n_features=NB_DATA_FEATURES,
objective_dataset_param_n_targets=NB_DATA_TARGETS,
objective_dataset_param_dtype=DTYPE,
objective_dataset_param_random_state=DATA_RANDOM_STATE,
objective_objective_param_n_components=NB_COMPONENTS,
objective_objective_param_alpha=ALPHA,
objective_objective_param_fit_intercept=ALPHA,
objective_objective_param_alpha=ALPHA,
objective_objective_param_random_state=SOLVER_RANDOM_STATE,
objective_solver_param___name=BACKEND_PROVIDER,
objective_solver_param_device=COMPUTE_DEVICE,
objective_solver_param_runtime=COMPUTE_RUNTIME,
objective_solver_param_power_iteration_normalizer=POWER_ITERATION_NORMALIZER,
objective_solver_param_iterated_power=ITERATED_POWER,
objective_solver_param_svd_solver=SVD_SOLVER,
objective_solver_param_solver=SOLVER,
objective_solver_param_comment=COMMENT,
objective_solver_param_version_info=VERSION_INFO,
objective_solver_param_run_date=RUN_DATE,
Expand Down Expand Up @@ -486,7 +491,7 @@ def _gspread_sync(source, gspread_url, gspread_auth_key):

argparser = ArgumentParser(
description=(
"Print an aggregated CSV-formated database of pca benchmark results "
"Print an aggregated CSV-formated database of ridge benchmark results "
"for the sklearn-engine-benchmarks project hosted at "
"https://github.com/soda-inria/sklearn-engine-benchmarks.\n\n"
"The inputs are assumed to be a collection of benchopt parquet files and "
Expand All @@ -509,15 +514,15 @@ def _gspread_sync(source, gspread_url, gspread_auth_key):
argparser.add_argument(
"--check-csv",
action="store_true",
help="Perform a few sanity checks on a CSV database of pca benchmark "
help="Perform a few sanity checks on a CSV database of ridge benchmark "
"results. If this option is passed, then the command only expects a single "
"input path to a csv file.",
)

argparser.add_argument(
"--sync-to-gspread",
action="store_true",
help="Synchronize a CSV database of pca benchmark results to a google "
help="Synchronize a CSV database of ridge benchmark results to a google "
"spreadsheet and format it nicely. If this option is passed, then the command "
"only expects a single input path to a csv file, and also requires "
"--gspread-url and --gspread-auth-key.",
Expand Down

0 comments on commit d0720da

Please sign in to comment.