From 8f06185f5f916074da845332013238c80941ab7e Mon Sep 17 00:00:00 2001
From: Franck Charras <29153872+fcharras@users.noreply.github.com>
Date: Thu, 11 Jan 2024 11:01:40 +0100
Subject: [PATCH] WIP: add benchmark setup for Ridge

---
 ..._benchmark_results_file_sanity_checks.yaml |   1 +
 .../sync_benchmark_files_to_gsheet.yaml       |   3 +
 .github/workflows/test_cpu_benchmarks.yaml    |   2 +
 README.md                                     |   1 +
 benchmarks/kmeans/objective.py                |  11 +-
 benchmarks/ridge/consolidate_result_csv.py    | 647 ++++++++++++++++++
 benchmarks/ridge/datasets/simulated_blobs.py  |  41 ++
 benchmarks/ridge/objective.py                 |  86 +++
 benchmarks/ridge/solvers/scikit_learn.py      |  76 ++
 9 files changed, 863 insertions(+), 5 deletions(-)
 create mode 100644 benchmarks/ridge/consolidate_result_csv.py
 create mode 100644 benchmarks/ridge/datasets/simulated_blobs.py
 create mode 100644 benchmarks/ridge/objective.py
 create mode 100644 benchmarks/ridge/solvers/scikit_learn.py

diff --git a/.github/workflows/run_benchmark_results_file_sanity_checks.yaml b/.github/workflows/run_benchmark_results_file_sanity_checks.yaml
index d427868..a93e386 100644
--- a/.github/workflows/run_benchmark_results_file_sanity_checks.yaml
+++ b/.github/workflows/run_benchmark_results_file_sanity_checks.yaml
@@ -19,3 +19,4 @@ jobs:
         run: |
           python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv --check-csv
           python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv --check-csv
+          python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv --check-csv
diff --git a/.github/workflows/sync_benchmark_files_to_gsheet.yaml b/.github/workflows/sync_benchmark_files_to_gsheet.yaml
index cbd2f25..24505a6 100644
--- a/.github/workflows/sync_benchmark_files_to_gsheet.yaml
+++ b/.github/workflows/sync_benchmark_files_to_gsheet.yaml
@@ -24,8 +24,11 @@ jobs:
         run: |
           python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv --check-csv
           python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv --check-csv
+          python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv --check-csv
           echo "$GSPREAD_SERVICE_ACCOUNT_AUTH_KEY" > service_account.json
           python ./benchmarks/kmeans/consolidate_result_csv.py ./benchmarks/kmeans/results.csv \
             --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
           python ./benchmarks/pca/consolidate_result_csv.py ./benchmarks/pca/results.csv \
             --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
+          python ./benchmarks/ridge/consolidate_result_csv.py ./benchmarks/ridge/results.csv \
+            --sync-to-gspread --gspread-url $GSPREAD_URL --gspread-auth-key ./service_account.json
diff --git a/.github/workflows/test_cpu_benchmarks.yaml b/.github/workflows/test_cpu_benchmarks.yaml
index 57b0707..31cfda8 100644
--- a/.github/workflows/test_cpu_benchmarks.yaml
+++ b/.github/workflows/test_cpu_benchmarks.yaml
@@ -143,3 +143,5 @@ jobs:
           PYTHONPATH=$PYTHONPATH:$(realpath ../../kmeans_dpcpp/) benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=1000,n_features=14]
           cd ../pca
           benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=100,n_features=100]
+          cd ../ridge
+          benchopt run --no-plot -l -d Simulated_correlated_data[n_samples=100,n_features=100]  # TODO add relevant parameters
diff --git a/README.md b/README.md
index 82f5e88..214f473 100644
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ hardware.
 Benchmarks are currently available for the following algorithms:
 - [k-means](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/kmeans)
 - [PCA](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/pca)
+- [Ridge](https://github.com/soda-inria/sklearn-engine-benchmarks/tree/main/benchmarks/ridge)
 
 Here is a (non-exhaustive) list of libraries that are compared in the benchmarks:
 - [scikit-learn](https://scikit-learn.org/stable/index.html)
diff --git a/benchmarks/kmeans/objective.py b/benchmarks/kmeans/objective.py
index 63f3bf1..c3e860c 100644
--- a/benchmarks/kmeans/objective.py
+++ b/benchmarks/kmeans/objective.py
@@ -32,15 +32,15 @@ def set_data(self, X, **dataset_parameters):
         self.X = X
         dtype = X.dtype
 
-        if self.init == "random" or self.sample_weight == "random":
-            rng = np.random.default_rng(self.random_state)
-
         if self.sample_weight == "None":
             sample_weight = None
         elif self.sample_weight == "unary":
             sample_weight = np.ones(len(X), dtype=dtype)
         elif self.sample_weight == "random":
-            sample_weight = rng.random(size=len(X)).astype(dtype)
+            rng_sample_weight = np.random.default_rng(
+                dataset_parameters["random_state"] + 1
+            )
+            sample_weight = rng_sample_weight.random(size=len(X)).astype(dtype)
         else:
             raise ValueError(
                 "Expected 'sample_weight' parameter to be either equal to 'None', "
@@ -48,8 +48,9 @@ def set_data(self, X, **dataset_parameters):
             )
 
         if self.init == "random":
+            rng_init = np.random.default_rng(self.random_state)
             init = np.array(
-                rng.choice(X, self.n_clusters, replace=False), dtype=X.dtype
+                rng_init.choice(X, self.n_clusters, replace=False), dtype=X.dtype
             )
         elif self.init == "k-means++":
             init = self.init
diff --git a/benchmarks/ridge/consolidate_result_csv.py b/benchmarks/ridge/consolidate_result_csv.py
new file mode 100644
index 0000000..3f780e9
--- /dev/null
+++ b/benchmarks/ridge/consolidate_result_csv.py
@@ -0,0 +1,647 @@
+import hashlib
+from functools import partial
+from io import BytesIO
+from itertools import zip_longest
+from operator import attrgetter
+
+import numpy as np
+import pandas as pd
+from pandas.io.parsers.readers import STR_NA_VALUES
+
+GOOGLE_WORKSHEET_NAME = "Ridge"
+
+DATES_FORMAT = "%Y-%m-%d"
+
+BENCHMARK_DEFINING_COLUMNS = [
+    "objective_objective_param___name",
+    "objective_dataset_param___name",
+    "objective_dataset_param_n_samples",
+    "objective_dataset_param_n_features",
+    "objective_dataset_param_n_targets",
+    "objective_dataset_param_dtype",
+    "objective_dataset_param_random_state",
+    "objective_objective_param_alpha",
+    "objective_objective_param_fit_intercept",
+    "objective_objective_param_max_iter",
+    "objective_objective_param_tol",
+    "objective_objective_param_sample_weight",
+    "objective_objective_param_random_state",
+]
+
+BENCHMARK_DEFINING_COLUMNS = sorted(BENCHMARK_DEFINING_COLUMNS)
+_benchmark_defining_columns_identifier = "".join(sorted(BENCHMARK_DEFINING_COLUMNS))
+
+BACKEND_PROVIDER = "Backend provider"
+COMMENT = "Comment"
+COMPUTE_DEVICE = "Compute device"
+COMPUTE_RUNTIME = "Compute runtime"
+DATA_RANDOM_STATE = "Data random state"
+DATA_SAMPLE_WEIGHTS = "Data sample weights"
+DTYPE = "Dtype"
+ALPHA = "alpha"
+NB_DATA_FEATURES = "Nb data features"
+NB_DATA_SAMPLES = "Nb data samples"
+NB_DATA_TARGETS = "Nb data targets"
+SOLVER = "Solver"
+PLATFORM = "Platform"
+PLATFORM_ARCHITECTURE = "Platform architecture"
+PLATFORM_RELEASE = "Platform release"
+SYSTEM_CPUS = "Nb cpus"
+SYSTEM_PROCESSOR = "Cpu name"
+SYSTEM_RAM = "RAM (GB)"
+SYSTEM_GPU = "Gpu name"
+RESULT_NB_ITERATIONS = "Result nb iterations"
"Result nb iterations" +OBJECTIVE_FUNCTION_VALUE = "Result objective value" +VERSION_INFO = "Version info" +RUN_DATE = "Run date" +SOLVER_RANDOM_STATE = "Solver random state" +WALLTIME = "Walltime" + +BENCHMARK_ID_NAME = "Benchmark id" + +TABLE_DISPLAY_ORDER = [ + BENCHMARK_ID_NAME, + DTYPE, + NB_DATA_SAMPLES, + NB_DATA_FEATURES, + NB_DATA_TARGETS, + DATA_SAMPLE_WEIGHTS, + ALPHA, + WALLTIME, + BACKEND_PROVIDER, + COMPUTE_DEVICE, + COMPUTE_RUNTIME, + SOLVER, + SYSTEM_CPUS, + SYSTEM_PROCESSOR, + SYSTEM_GPU, + SYSTEM_RAM, + PLATFORM, + PLATFORM_ARCHITECTURE, + PLATFORM_RELEASE, + RUN_DATE, + VERSION_INFO, + COMMENT, + RESULT_NB_ITERATIONS, + OBJECTIVE_FUNCTION_VALUE, + DATA_RANDOM_STATE, + SOLVER_RANDOM_STATE, +] + +COLUMNS_DTYPES = { + BENCHMARK_ID_NAME: str, + DTYPE: str, + NB_DATA_SAMPLES: np.int64, + NB_DATA_FEATURES: np.int64, + NB_DATA_TARGETS: np.int64, + ALPHA: np.float64, + WALLTIME: np.float64, + BACKEND_PROVIDER: str, + COMPUTE_DEVICE: str, + COMPUTE_RUNTIME: str, + RESULT_NB_ITERATIONS: np.int64, + OBJECTIVE_FUNCTION_VALUE: np.float64, + SOLVER: str, + PLATFORM: str, + PLATFORM_ARCHITECTURE: str, + PLATFORM_RELEASE: str, + SYSTEM_CPUS: np.int64, + SYSTEM_PROCESSOR: str, + SYSTEM_GPU: str, + SYSTEM_RAM: np.int64, + DATA_RANDOM_STATE: np.int64, + SOLVER_RANDOM_STATE: np.int64, + VERSION_INFO: str, + RUN_DATE: str, + COMMENT: str, +} + +COLUMNS_WITH_NONE_STRING = [DATA_SAMPLE_WEIGHTS] + +# If all those fields have equal values for two given benchmarks, then the oldest +# benchmark (given by RUN_DATE) will be discarded +UNIQUE_BENCHMARK_KEY = [ + BENCHMARK_ID_NAME, + DTYPE, + NB_DATA_SAMPLES, + NB_DATA_FEATURES, + NB_DATA_TARGETS, + DATA_SAMPLE_WEIGHTS, + BACKEND_PROVIDER, + SOLVER, + COMPUTE_DEVICE, + COMPUTE_RUNTIME, + PLATFORM, + PLATFORM_ARCHITECTURE, + SYSTEM_PROCESSOR, + SYSTEM_CPUS, + SYSTEM_GPU, + DATA_RANDOM_STATE, + SOLVER_RANDOM_STATE, +] + +# Importance and say if ascending / descending +ROW_SORT_ORDER = [ + (DTYPE, True), + (NB_DATA_SAMPLES, False), + (NB_DATA_FEATURES, False), + (NB_DATA_TARGETS, False), + (DATA_SAMPLE_WEIGHTS, True), + (ALPHA, False), + (WALLTIME, True), + (BACKEND_PROVIDER, True), + (COMPUTE_DEVICE, True), + (COMPUTE_RUNTIME, True), + (RESULT_NB_ITERATIONS, True), + (OBJECTIVE_FUNCTION_VALUE, False), + (SOLVER, True), + (SYSTEM_GPU, True), + (SYSTEM_CPUS, True), + (PLATFORM, True), + (PLATFORM_ARCHITECTURE, True), + (PLATFORM_RELEASE, True), + (SYSTEM_PROCESSOR, True), + (SYSTEM_RAM, True), + (DATA_RANDOM_STATE, True), + (SOLVER_RANDOM_STATE, True), + (RUN_DATE, False), + (VERSION_INFO, False), + (COMMENT, True), + (BENCHMARK_ID_NAME, True), +] +_row_sort_by, _row_sort_ascending = map(list, zip(*ROW_SORT_ORDER)) + +PARQUET_TABLE_DISPLAY_MAPPING = dict( + time=WALLTIME, + objective_value=OBJECTIVE_FUNCTION_VALUE, + objective_n_iter=RESULT_NB_ITERATIONS, + objective_dataset_param_n_samples=NB_DATA_SAMPLES, + objective_dataset_param_n_features=NB_DATA_FEATURES, + objective_dataset_param_n_targets=NB_DATA_TARGETS, + objective_dataset_param_dtype=DTYPE, + objective_dataset_param_random_state=DATA_RANDOM_STATE, + objective_objective_param_alpha=ALPHA, + objective_objective_param_fit_intercept=ALPHA, + objective_objective_param_alpha=ALPHA, + objective_objective_param_random_state=SOLVER_RANDOM_STATE, + objective_solver_param___name=BACKEND_PROVIDER, + objective_solver_param_device=COMPUTE_DEVICE, + objective_solver_param_runtime=COMPUTE_RUNTIME, + objective_solver_param_solver=SOLVER, + objective_solver_param_comment=COMMENT, + 
+    objective_solver_param_version_info=VERSION_INFO,
+    objective_solver_param_run_date=RUN_DATE,
+    platform=PLATFORM,
+)
+
+PARQUET_TABLE_DISPLAY_MAPPING.update(
+    {
+        "platform-architecture": PLATFORM_ARCHITECTURE,
+        "platform-release": PLATFORM_RELEASE,
+        "system-cpus": SYSTEM_CPUS,
+        "system-processor": SYSTEM_PROCESSOR,
+        "system-ram (GB)": SYSTEM_RAM,
+    }
+)
+_all_table_columns = list(PARQUET_TABLE_DISPLAY_MAPPING) + [BENCHMARK_ID_NAME]
+
+ALL_EXPECTED_COLUMNS = set(BENCHMARK_DEFINING_COLUMNS + _all_table_columns)
+
+IDS_LENGTH = 8
+
+
+def _get_id_from_str(s):
+    return hashlib.sha256(s.encode("utf8"), usedforsecurity=False).hexdigest()[
+        :IDS_LENGTH
+    ]
+
+
+def _get_sample_id_for_columns(row, defining_columns, constant_identifier):
+    return _get_id_from_str(
+        "".join(row[defining_columns].astype(str)) + constant_identifier
+    )
+
+
+def _validate_one_parquet_table(source):
+    df = pd.read_parquet(source)
+
+    # NB: we're lenient on the columns
+    for col in ALL_EXPECTED_COLUMNS - set(df.columns):
+        df[col] = None
+
+    df[BENCHMARK_ID_NAME] = df.apply(
+        lambda row: _get_sample_id_for_columns(
+            row, BENCHMARK_DEFINING_COLUMNS, _benchmark_defining_columns_identifier
+        ),
+        axis=1,
+    )
+
+    df = df[_all_table_columns]
+    df.rename(columns=PARQUET_TABLE_DISPLAY_MAPPING, inplace=True, errors="raise")
+
+    df[RUN_DATE] = df[RUN_DATE].astype("datetime64[ns]")
+
+    return df
+
+
+def _validate_one_csv_table(source, parse_dates=True, order_columns=True):
+    NA_VALUES = set(STR_NA_VALUES)
+    NA_VALUES.discard("None")
+
+    df = pd.read_csv(
+        source,
+        usecols=TABLE_DISPLAY_ORDER,
+        dtype=COLUMNS_DTYPES,
+        index_col=False,
+        na_values={col: NA_VALUES for col in COLUMNS_WITH_NONE_STRING},
+        keep_default_na=False,
+    )
+
+    if order_columns:
+        df = df[TABLE_DISPLAY_ORDER]
+
+    if parse_dates:
+        df[RUN_DATE] = pd.to_datetime(df[RUN_DATE], format=DATES_FORMAT).astype(
+            "datetime64[ns]"
+        )
+
+    return df
+
+
+def _assemble_output_table(
+    dfs_from_csv, dfs_from_parquet, parquet_gpu_name, create_gpu_entry, list_known_gpus
+):
+
+    if not list_known_gpus and (len(dfs_from_parquet) == 0):
+        if parquet_gpu_name is not None:
+            parameter_name = (
+                "--parquet-gpu-name" if parquet_gpu_name else "--no-parquet-gpu-name"
+            )
+            raise ValueError(
+                f"The parameter {parameter_name} should only be used if at least one "
+                "benchopt parquet table is being consolidated, but only got csv tables."
+            )
+        if create_gpu_entry is not False:
+            raise ValueError(
+                "The parameter --new-gpu-entry should only be used if at least one "
+                "benchopt parquet table is being consolidated, but got only csv tables."
+            )
+    elif not list_known_gpus and parquet_gpu_name is None:
+        raise ValueError(
+            "Please use the --parquet-gpu-name parameter to provide a gpu name that "
+            "will be added to the metadata of the samples in the input parquet tables "
+            "or use the --no-parquet-gpu-name if you intend to leave the corresponding "
+            "field empty."
+        )
+
+    else:
+        gpu_names_from_csv = set(
+            gpu_name
+            for df in dfs_from_csv
+            for gpu_name in df[SYSTEM_GPU]
+            if (len(gpu_name) > 0)
+        )
+
+        if list_known_gpus:
+            print("\n".join(gpu_names_from_csv))
+            return False
+
+        if (
+            (len(parquet_gpu_name) > 0)
+            and (parquet_gpu_name not in gpu_names_from_csv)
+            and not create_gpu_entry
+        ):
+            raise IndexError(
+                f"The gpu name {parquet_gpu_name} is unknown. Please use the "
Please use the " + "--new-gpu-entry parameter to confirm the addition of the new gpu " + "entry in the output csv table, or use --list-known-gpus parameter to " + "print a list of gpus names that have been already registered and use " + "one of those to bypass this error." + ) + + for df in dfs_from_parquet: + df[SYSTEM_GPU] = parquet_gpu_name + + df_list = dfs_from_csv + dfs_from_parquet + + if len(df_list) > 1: + df = pd.concat(df_list, ignore_index=True, copy=False) + else: + df = df_list[0] + + df = df[TABLE_DISPLAY_ORDER] + df.sort_values( + by=_row_sort_by, ascending=_row_sort_ascending, inplace=True, kind="stable" + ) + # HACK: sanitize mix of None values and empty strings that can happen when some + # columns are missing in the parquet input files (because it's optional and no + # solver returns it in the batch) by passing the data to CSV and re-loading + # again from CSV + df = _sanitize_df_with_tocsv(df) + + df.drop_duplicates(subset=UNIQUE_BENCHMARK_KEY, inplace=True, ignore_index=True) + + return df + + +def _sanitize_df_with_tocsv(df): + in_memory_buffer = BytesIO() + _df_to_csv(df, in_memory_buffer) + in_memory_buffer.seek(0) + return _validate_one_csv_table(in_memory_buffer, order_columns=False) + + +def _df_to_csv(df, target): + float_format_fn = partial( + np.format_float_positional, + precision=3, + unique=True, + fractional=False, + trim="-", + sign=False, + pad_left=None, + pad_right=None, + min_digits=None, + ) + df = df.copy() + df[WALLTIME] = df[WALLTIME].map(float_format_fn) + df.to_csv( + target, + index=False, + mode="a", + date_format=DATES_FORMAT, + ) + + +def _gspread_sync(source, gspread_url, gspread_auth_key): + import gspread + + df = _validate_one_csv_table(source, parse_dates=False) + + n_rows, n_cols = df.shape + walltime_worksheet_col = df.columns.get_loc(WALLTIME) + 1 + + gs = gspread.service_account(gspread_auth_key) + sheet = gs.open_by_url(gspread_url) + + global_range = ( + f"{gspread.utils.rowcol_to_a1(1, 1)}:" + f"{gspread.utils.rowcol_to_a1(n_rows + 1, n_cols)}" + ) + + try: + worksheet = sheet.worksheet(GOOGLE_WORKSHEET_NAME) + worksheet.clear() + worksheet.clear_basic_filter() + worksheet.freeze(0, 0) + worksheet.resize(rows=n_rows + 1, cols=n_cols) + worksheet.clear_notes(global_range) + white_background = dict( + backgroundColorStyle=dict(rgbColor=dict(red=1, green=1, blue=1, alpha=1)) + ) + worksheet.format(global_range, white_background) + except gspread.WorksheetNotFound: + worksheet = sheet.add_worksheet( + GOOGLE_WORKSHEET_NAME, rows=n_rows + 1, cols=n_cols + ) + # ensure worksheets are sorted anti-alphabetically + sheet.reorder_worksheets( + sorted(sheet.worksheets(), key=attrgetter("title"), reverse=True) + ) + + # upload all values + worksheet.update( + values=[df.columns.values.tolist()] + df.values.tolist(), range_name="A1" + ) + + # set filter + worksheet.set_basic_filter(1, 1, n_rows + 1, n_cols) + + # freeze filter rows and benchmark-defining cols + worksheet.freeze(rows=1, cols=walltime_worksheet_col) + + format_queries = [] + + # Text is centerd and wrapped in all cells + global_format = dict( + horizontalAlignment="CENTER", + verticalAlignment="MIDDLE", + wrapStrategy="WRAP", + ) + format_queries.append(dict(range=global_range, format=global_format)) + + # benchmark_id and walltime columns are bold + bold_format = dict(textFormat=dict(bold=True)) + benchmark_id_col_range = ( + f"{gspread.utils.rowcol_to_a1(2, 1)}:" + f"{gspread.utils.rowcol_to_a1(n_rows + 1, 1)}" + ) + walltime_col_range = ( + f"{gspread.utils.rowcol_to_a1(2, 
walltime_worksheet_col)}:" + f"{gspread.utils.rowcol_to_a1(n_rows + 1, walltime_worksheet_col)}" + ) + format_queries.append(dict(range=benchmark_id_col_range, format=bold_format)) + format_queries.append(dict(range=walltime_col_range, format=bold_format)) + + # Header is light-ish yellow + yellow_lighter_header = dict( + backgroundColorStyle=dict( + rgbColor=dict(red=1, green=1, blue=102 / 255, alpha=1) + ) + ) + header_row_range = ( + f"{gspread.utils.rowcol_to_a1(1, 1)}:" + f"{gspread.utils.rowcol_to_a1(1, n_cols)}" + ) + format_queries.append(dict(range=header_row_range, format=yellow_lighter_header)) + + # Every other benchmark_id has greyed background + bright_gray_background = dict( + backgroundColorStyle=dict( + rgbColor=dict(red=232 / 255, green=233 / 255, blue=235 / 255, alpha=1) + ) + ) + benchmark_ids = df[BENCHMARK_ID_NAME] + benchmark_ids_ending_idx = ( + np.where((benchmark_ids.shift() != benchmark_ids).values[1:])[0] + 2 + ) + for benchmark_id_range_start, benchmark_id_range_end in zip_longest( + *(iter(benchmark_ids_ending_idx),) * 2 + ): + benchmark_row_range = ( + f"{gspread.utils.rowcol_to_a1(benchmark_id_range_start + 1, 1)}:" + f"{gspread.utils.rowcol_to_a1(benchmark_id_range_end or (n_rows + 1), n_cols)}" # noqa + ) + format_queries.append( + dict(range=benchmark_row_range, format=bright_gray_background) + ) + + # Apply formats + worksheet.batch_format(format_queries) + + # auto-resize rows and cols + worksheet.columns_auto_resize(0, n_cols - 1) + worksheet.rows_auto_resize(0, n_rows) + + +if __name__ == "__main__": + import os + import sys + from argparse import ArgumentParser + + argparser = ArgumentParser( + description=( + "Print an aggregated CSV-formated database of ridge benchmark results " + "for the sklearn-engine-benchmarks project hosted at " + "https://github.com/soda-inria/sklearn-engine-benchmarks.\n\n" + "The inputs are assumed to be a collection of benchopt parquet files and " + "CSV files, well formated according to the project current specs. This " + "command assumes rhat the inputs are valid and is lenient at checking " + "types, null values, or missing columns, hence the user is advised to " + "cautiously check outputs before using.\n\n" + "If several results are found for identical benchmarks, only the most " + "recent `Run date` value is retained, all anterior entries are discarded " + "from the output CSV." + ) + ) + + argparser.add_argument( + "benchmark_files", + nargs="+", + help="benchopt parquet files or sklearn-engine-benchmarks csv files", + ) + + argparser.add_argument( + "--check-csv", + action="store_true", + help="Perform a few sanity checks on a CSV database of ridge benchmark " + "results. If this option is passed, then the command only expects a single " + "input path to a csv file.", + ) + + argparser.add_argument( + "--sync-to-gspread", + action="store_true", + help="Synchronize a CSV database of ridge benchmark results to a google " + "spreadsheet and format it nicely. If this option is passed, then the command " + "only expects a single input path to a csv file, and also requires " + "--gspread-url and --gspread-auth-key.", + ) + + argparser.add_argument( + "--gspread-url", + help="URL to a google spreadsheet. Expected if and only if --sync-to-gspread " + "is passed.", + ) + + argparser.add_argument( + "--gspread-auth-key", + help="Path to a json authentication key for a gspread service account. 
" + "Expected if and only if --sync-to-gspread is passed.", + ) + + argparser.add_argument( + "--parquet-gpu-name", + help="Name of the GPU on the host that runs the benchmarks that are recorded " + "in the input parquet files.", + ) + + argparser.add_argument( + "--no-parquet-gpu-name", + action="store_true", + help="Do not insert a GPU name in the metadata of the benchmark samples that " + "were recorded in the input parquet files (and leave it blank).", + ) + + argparser.add_argument( + "--new-gpu-entry", + action="store_true", + help="Use this parameter along with --parquet-gpu-name to confirm that if the " + "GPU name is not yet known in the existing databases, it will be added to the " + "list of known GPU names. Else the command will throw an error.", + ) + + argparser.add_argument( + "--list-known-gpus", + action="store_true", + help="Will print a list of the GPU names that are used in CSV benchmark files.", + ) + + args = argparser.parse_args() + + if (parquet_gpu_name := args.parquet_gpu_name) is None and args.no_parquet_gpu_name: + parquet_gpu_name = "" + + create_gpu_entry = args.new_gpu_entry + list_known_gpus = args.list_known_gpus + + paths = args.benchmark_files + if (check_csv := args.check_csv) or args.sync_to_gspread: + if (n_paths := len(paths)) > 1: + command = "--check-csv" if check_csv else "--sync-to-gspread" + raise ValueError( + f"A single input path to a csv file is expected when the {command} " + f"parameter is passed, but you passed {n_paths - 1} additional " + "arguments." + ) + path = paths[0] + _, file_extension = os.path.splitext(path) + if file_extension != ".csv": + raise ValueError( + "Expecting a '.csv' file extensions, but got " + f"{file_extension} instead !" + ) + + if check_csv: + df_loaded = _validate_one_csv_table(path) + df_clean = _assemble_output_table( + dfs_from_csv=[df_loaded], + dfs_from_parquet=[], + parquet_gpu_name=None, + create_gpu_entry=False, + list_known_gpus=list_known_gpus, + ) + + pd.testing.assert_frame_equal(df_loaded, df_clean) + + if gspread_sync := args.sync_to_gspread: + if (gspread_url := args.gspread_url) is None: + raise ValueError( + "Please provide a URL to a google spreadsheet using the " + "--gspread-url parameter." + ) + + if (gspread_auth_key := args.gspread_auth_key) is None: + raise ValueError( + "Please use the --gspread-auth-key parameter to pass a json " + "authentication key for a service account from the google developer " + "console." + ) + _gspread_sync(path, gspread_url, gspread_auth_key) + + if not check_csv and not gspread_sync: + dfs_from_parquet, dfs_from_csv = [], [] + for path in paths: + _, file_extension = os.path.splitext(path) + if file_extension == ".parquet": + if list_known_gpus: + continue + dfs_from_parquet.append(_validate_one_parquet_table(path)) + elif file_extension == ".csv": + dfs_from_csv.append(_validate_one_csv_table(path, order_columns=False)) + else: + raise ValueError( + "Expecting '.csv' or '.parquet' file extensions, but got " + f"{file_extension} instead !" 
+                )
+
+        df = _assemble_output_table(
+            dfs_from_csv=dfs_from_csv,
+            dfs_from_parquet=dfs_from_parquet,
+            parquet_gpu_name=parquet_gpu_name,
+            create_gpu_entry=create_gpu_entry,
+            list_known_gpus=list_known_gpus,
+        )
+
+        if df is not False:
+            _df_to_csv(df, sys.stdout)
diff --git a/benchmarks/ridge/datasets/simulated_blobs.py b/benchmarks/ridge/datasets/simulated_blobs.py
new file mode 100644
index 0000000..0ef5605
--- /dev/null
+++ b/benchmarks/ridge/datasets/simulated_blobs.py
@@ -0,0 +1,41 @@
+from benchopt import BaseDataset, safe_import_context
+from benchopt.datasets import make_correlated_data
+
+with safe_import_context() as import_ctx:
+    import numpy as np
+
+
+class Dataset(BaseDataset):
+    name = "Simulated_correlated_data"
+
+    parameters = {
+        "n_samples, n_features": [
+            (20_000_000, 100),
+            (15_000, 15_000),
+            (2_000_000, 100),
+            (5000, 5000),
+        ],
+        "n_targets": [1, 5, 20],
+        "dtype": ["float32"],
+        "random_state": [123],
+    }
+
+    def __init__(self, n_samples, n_features, n_targets, dtype, random_state):
+        self.n_samples = n_samples
+        self.n_features = n_features
+        self.n_targets = n_targets
+        self.random_state = random_state
+        self.dtype = dtype
+
+    def get_data(self):
+        rng = np.random.RandomState(self.random_state)
+
+        X, y = make_correlated_data(
+            self.n_samples, self.n_features, self.n_targets, random_state=rng
+        )
+
+        dtype = getattr(np, self.dtype)
+
+        return dict(
+            X=X.astype(dtype), y=y.astype(dtype), __name=self.name, **self._parameters
+        )
diff --git a/benchmarks/ridge/objective.py b/benchmarks/ridge/objective.py
new file mode 100644
index 0000000..67c8708
--- /dev/null
+++ b/benchmarks/ridge/objective.py
@@ -0,0 +1,86 @@
+from datetime import datetime
+
+from benchopt import BaseObjective, safe_import_context
+
+with safe_import_context() as import_ctx:
+    import numpy as np
+
+
+class Objective(BaseObjective):
+    name = "Ridge walltime"
+    url = "https://github.com/soda-inria/sklearn-engine-benchmarks"
+
+    requirements = ["numpy"]
+
+    # Since our goal is to measure walltime for solvers that perform the exact same
+    # computations, the solver parameters are part of the objective and must be set
+    # for all solvers, rather than being an independent benchmark space for each
+    # solver.
+    parameters = {
+        "alpha": [1.0],
+        "fit_intercept": [True],
+        "solver, max_iter, tol": [("svd", None, 0)],
+        "sample_weight": ["None", "random"],
+        "random_state": [123],
+    }
+
+    def set_data(self, X, y, **dataset_parameters):
+        self.X, self.y = X, y
+        dtype = X.dtype
+
+        if self.sample_weight == "None":
+            sample_weight = None
+        elif self.sample_weight == "unary":
+            sample_weight = np.ones(len(X), dtype=dtype)
+        elif self.sample_weight == "random":
+            rng_sample_weight = np.random.default_rng(
+                dataset_parameters["random_state"] + 1
+            )
+            sample_weight = rng_sample_weight.random(size=len(X)).astype(dtype)
+        else:
+            raise ValueError(
+                "Expected 'sample_weight' parameter to be either equal to 'None', "
+                f"'unary' or 'random', but got {self.sample_weight}."
+            )
+
+        self.sample_weight_ = sample_weight
+        self.dataset_parameters = dataset_parameters
+
+    def evaluate_result(self, objective, **solver_parameters):
+        all_parameters = dict(solver_param_run_date=datetime.today())
+        all_parameters.update(
+            {
+                ("dataset_param_" + key): value
+                for key, value in self.dataset_parameters.items()
+            }
+        )
+        all_parameters.update(
+            {
+                ("objective_param_" + key): value
+                for key, value in self._parameters.items()
+            }
+        )
+        all_parameters.update(
+            {("solver_param_" + key): value for key, value in solver_parameters.items()}
+        )
+        return dict(
+            value=objective,
+            objective_param___name=self.name,
+            **all_parameters,
+        )
+
+    def get_one_result(self):
+        return dict(objective=1)
+
+    def get_objective(self):
+        return dict(
+            X=self.X,
+            y=self.y,
+            sample_weight=self.sample_weight_,
+            alpha=self.alpha,
+            fit_intercept=self.fit_intercept,
+            solver=self.solver,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            random_state=self.random_state,
+        )
diff --git a/benchmarks/ridge/solvers/scikit_learn.py b/benchmarks/ridge/solvers/scikit_learn.py
new file mode 100644
index 0000000..d60f41f
--- /dev/null
+++ b/benchmarks/ridge/solvers/scikit_learn.py
@@ -0,0 +1,76 @@
+from importlib.metadata import version
+
+from benchopt import BaseSolver, safe_import_context
+from benchopt.stopping_criterion import SingleRunCriterion
+
+with safe_import_context() as import_ctx:
+    from sklearn.linear_model import Ridge
+    from sklearn.linear_model._base import _rescale_data
+
+
+class Solver(BaseSolver):
+    name = "scikit-learn"
+    requirements = ["scikit-learn"]
+
+    stopping_criterion = SingleRunCriterion(1)
+
+    def set_objective(
+        self,
+        X,
+        y,
+        sample_weight,
+        alpha,
+        fit_intercept,
+        solver,
+        max_iter,
+        tol,
+        random_state,
+    ):
+        # Copy the data before running the benchmark to ensure that no unfortunate side
+        # effects can happen
+        self.X = X.copy()
+        self.y = y.copy()
+
+        if hasattr(sample_weight, "copy"):
+            sample_weight = sample_weight.copy()
+        self.sample_weight = sample_weight
+
+        self.alpha = alpha
+        self.fit_intercept = fit_intercept
+        self.solver = solver
+        self.max_iter = max_iter
+        self.tol = tol
+        self.random_state = random_state
+
+    def run(self, _):
+        estimator = Ridge(
+            alpha=self.alpha,
+            fit_intercept=self.fit_intercept,
+            copy_X=False,
+            max_iter=self.max_iter,
+            tol=self.tol,
+            solver=self.solver,
+            positive=True if (self.solver == "lbfgs") else False,
+            random_state=self.random_state,
+        ).fit(self.X, self.y, self.sample_weight)
+
+        self.coef_ = estimator.coef_
+
+    def get_result(self):
+        X, y = self.X, self.y
+        if self.sample_weight is not None:
+            X, y, _ = _rescale_data(X, y, self.sample_weight, inplace=True)
+
+        y = y.reshape((y.shape[0], -1))
+
+        # coef_ has shape (n_features,) or (n_targets, n_features)
+        coef_ = self.coef_.reshape((-1, X.shape[1]))
+
+        objective = ((y - X @ coef_.T) ** 2).sum() + (
+            len(y.T) * self.alpha * (coef_**2).sum()
+        )
+
+        return dict(
+            objective=objective,
+            version_info=f"scikit-learn {version('scikit-learn')}",
+            __name=self.name,
+            **self._parameters,
+        )
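
Reviewer note (not part of the patch): the value reported by `Solver.get_result` above is the penalized squared error `||Y - X @ coef.T||^2 + n_targets * alpha * ||coef||^2`, computed on the (optionally sample-weight-rescaled) training data and without an intercept term in the residual. The minimal standalone sketch below recomputes that quantity from a plain scikit-learn `Ridge` fit; it assumes no sample weights and `fit_intercept=False` to keep the check short, and the toy data shapes are arbitrary.

```python
# Standalone sanity check of the reported objective value (illustrative only).
import numpy as np
from sklearn.linear_model import Ridge

rng = np.random.default_rng(123)
X = rng.standard_normal((1000, 100)).astype(np.float32)
y = rng.standard_normal((1000, 5)).astype(np.float32)  # 5 targets
alpha = 1.0

estimator = Ridge(alpha=alpha, fit_intercept=False, solver="svd").fit(X, y)
coef = estimator.coef_.reshape((-1, X.shape[1]))  # (n_targets, n_features)

# Same formula as in Solver.get_result: squared residuals plus the L2 penalty
# scaled by the number of targets.
objective = ((y - X @ coef.T) ** 2).sum() + len(y.T) * alpha * (coef**2).sum()
print(f"reported objective value: {objective:.3f}")
```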
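Similarly, the `Benchmark id` column that `consolidate_result_csv.py` uses to group spreadsheet rows is a truncated sha256 of the concatenated benchmark-defining column values. A sketch of that derivation follows, using made-up column names and values for illustration:

```python
# Sketch of the "Benchmark id" derivation from consolidate_result_csv.py:
# join the benchmark-defining values, append the constant identifier built from
# the sorted column names, and keep the first 8 hex chars of the sha256 digest.
import hashlib

IDS_LENGTH = 8
defining_columns = sorted(
    ["objective_dataset_param_n_samples", "objective_dataset_param_n_features"]
)
constant_identifier = "".join(defining_columns)

row_values = [20_000_000, 100]  # made-up values for the two columns above
payload = "".join(str(v) for v in row_values) + constant_identifier
benchmark_id = hashlib.sha256(payload.encode("utf8")).hexdigest()[:IDS_LENGTH]
print(benchmark_id)  # stable 8-character id shared by rows of the same benchmark
```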