diff --git a/python/cuml/cuml/cluster/dbscan.pyx b/python/cuml/cuml/cluster/dbscan.pyx index 9480ccccf6..f5b0b8d85d 100644 --- a/python/cuml/cuml/cluster/dbscan.pyx +++ b/python/cuml/cuml/cluster/dbscan.pyx @@ -225,6 +225,20 @@ class DBSCAN(UniversalBase, core_sample_indices_ = CumlArrayDescriptor(order="C") labels_ = CumlArrayDescriptor(order="C") + _hyperparam_interop_translator = { + "metric": { + "manhattan": "dispatch", + "chebyshev": "dispatch", + "minkowski": "dispatch", + }, + + "algorithm": { + "auto": "brute", + "ball_tree": "dispatch", + "kd_tree": "dispatch", + }, + } + @device_interop_preparation def __init__(self, *, eps=0.5, diff --git a/python/cuml/cuml/decomposition/pca.pyx b/python/cuml/cuml/decomposition/pca.pyx index 9433f724b9..db2f0f62c8 100644 --- a/python/cuml/cuml/decomposition/pca.pyx +++ b/python/cuml/cuml/decomposition/pca.pyx @@ -280,6 +280,16 @@ class PCA(UniversalBase, noise_variance_ = CumlArrayDescriptor(order='F') trans_input_ = CumlArrayDescriptor(order='F') + _hyperparam_interop_translator = { + "svd_solver": { + "arpack": "full", + "randomized": "full" + }, + "iterated_power": { + "auto": 15, + }, + } + @device_interop_preparation def __init__(self, *, copy=True, handle=None, iterated_power=15, n_components=None, random_state=None, svd_solver='auto', diff --git a/python/cuml/cuml/experimental/accel/__init__.py b/python/cuml/cuml/experimental/accel/__init__.py new file mode 100644 index 0000000000..1a54b59459 --- /dev/null +++ b/python/cuml/cuml/experimental/accel/__init__.py @@ -0,0 +1,50 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# Flipped to True by ``install()`` when every module accelerator returned a
# loader object.
LOADED = False


def install():
    """Enable cuML Accelerator Mode.

    Calls ``ModuleAccelerator.install`` once for each supported host library
    (``sklearn``, ``umap`` and ``hdbscan``) and stores in the module-level
    ``LOADED`` flag whether all of the calls returned a non-``None`` loader.
    """
    global LOADED

    # Deferred import: only needed once the accelerator is actually installed.
    from .module_accelerator import ModuleAccelerator

    print("Installing cuML Accelerator...")
    loaders = [
        ModuleAccelerator.install(host_library, "cuml", host_library)
        for host_library in ("sklearn", "umap", "hdbscan")
    ]
    LOADED = all(loader is not None for loader in loaders)


def pytest_load_initial_conftests(early_config, parser, args):
    """pytest hook that installs the accelerator before conftests are loaded.

    The three parameters are dictated by the pytest hook specification and
    are not used here.

    Raises
    ------
    RuntimeError
        If ``install()`` raises ``RuntimeError``. The original error is
        attached as ``__cause__`` so the underlying reason stays visible in
        the traceback.
    """
    # Hook reference:
    # https://docs.pytest.org/en/7.1.x/reference/reference.html#pytest.hookspec.pytest_load_initial_conftests
    try:
        install()
    except RuntimeError as err:
        raise RuntimeError(
            "An existing plugin has already loaded sklearn. Interposing failed."
        ) from err
@click.command()
@click.option("-m", "module", required=False, help="Module to run")
@click.option(
    "--profile",
    is_flag=True,
    default=False,
    help="Perform per-function profiling of this script.",
)
@click.option(
    "--line-profile",
    is_flag=True,
    default=False,
    help="Perform per-line profiling of this script.",
)
@click.option(
    "--strict",
    is_flag=True,
    default=False,
    help="Turn strict mode for hyperparameters on.",
)
@click.argument("args", nargs=-1)
def main(module, profile, line_profile, strict, args):
    """Run a module, a script or an interactive session with cuML acceleration.

    With ``-m MODULE`` the module is executed as ``python -m MODULE ARGS...``;
    otherwise the first positional argument is executed as a script path; with
    no target at all an interactive interpreter is started.

    Parameters
    ----------
    module : str or None
        Name of the module to run (value of ``-m``).
    profile, line_profile : bool
        Accepted on the command line but not acted upon by this function.
    strict : bool
        When set, exports ``CUML_ACCEL_STRICT_MODE=ON`` before installing.
    args : tuple of str
        Remaining command-line arguments, forwarded to the target.
    """

    # todo (dgd): add option to lower verbosity
    logger.set_level(logger.level_debug)
    logger.set_pattern("%v")

    if strict:
        os.environ["CUML_ACCEL_STRICT_MODE"] = "ON"

    install()

    if module:
        # Run the module passing the remaining arguments as if it were run
        # with ``python -m``. ``module`` is a plain string and ``args`` is a
        # tuple, so the new argv is built by unpacking rather than by
        # tuple-unpacking the name or concatenating list + tuple.
        sys.argv[:] = [module, *args]  # not thread safe?
        runpy.run_module(module, run_name="__main__")
    elif args:
        # Remove ourself from argv and continue
        sys.argv[:] = args
        runpy.run_path(args[0], run_name="__main__")
    else:
        if sys.stdin.isatty():
            banner = f"Python {sys.version} on {sys.platform}"
            site_import = not sys.flags.no_site
            if site_import:
                cprt = 'Type "help", "copyright", "credits" or "license" for more information.'
                banner += "\n" + cprt
        else:
            # Don't show prompts or banners if stdin is not a TTY
            sys.ps1 = ""
            sys.ps2 = ""
            banner = ""

        # Launch an interactive interpreter
        code.interact(banner=banner, exitmsg="")


if __name__ == "__main__":
    main()
# Estimator class names grouped by the cuML module that provides them.
# The groups are listed in the order the entries must appear in
# ``wrapped_estimators``; a module may therefore occur more than once.
_ESTIMATORS_BY_MODULE = (
    ("cuml.cluster", ("KMeans", "DBSCAN")),
    ("cuml.decomposition", ("PCA", "TruncatedSVD")),
    ("cuml.kernel_ridge", ("KernelRidge",)),
    (
        "cuml.linear_model",
        (
            "LinearRegression",
            "LogisticRegression",
            "ElasticNet",
            "Ridge",
            "Lasso",
        ),
    ),
    ("cuml.manifold", ("TSNE",)),
    (
        "cuml.neighbors",
        ("NearestNeighbors", "KNeighborsClassifier", "KNeighborsRegressor"),
    ),
    ("cuml.manifold", ("UMAP",)),
    ("cuml.cluster", ("HDBSCAN",)),
)

# Maps an estimator name to ``(cuml module path, cuml class name)``.
wrapped_estimators = {
    class_name: (module_path, class_name)
    for module_path, class_names in _ESTIMATORS_BY_MODULE
    for class_name in class_names
}
# Proxy for ``hdbscan.HDBSCAN`` backed by ``cuml.cluster.HDBSCAN``.
HDBSCAN = intercept(
    original_module="hdbscan",
    accelerated_module="cuml.cluster",
    original_class_name="HDBSCAN",
)

# Backward-compatible alias: this proxy used to be bound to the (misleading)
# name ``UMAP`` in this module. Prefer ``HDBSCAN``.
UMAP = HDBSCAN
# Every assignment below calls ``intercept`` with, in order, the original
# (scikit-learn) module, the accelerated (cuML) module and the class name,
# and binds the returned proxy class to the estimator's name.

# --------------------------- Clustering estimators ---------------------------
KMeans = intercept("sklearn.cluster", "cuml.cluster", "KMeans")
DBSCAN = intercept("sklearn.cluster", "cuml.cluster", "DBSCAN")
# Not intercepted yet: AgglomerativeClustering, HDBSCAN (sklearn.cluster).

# -------------------------- Decomposition estimators -------------------------
PCA = intercept("sklearn.decomposition", "cuml.decomposition", "PCA")
TruncatedSVD = intercept(
    "sklearn.decomposition", "cuml.decomposition", "TruncatedSVD"
)
# Not intercepted yet: IncrementalPCA.

# ----------------------------- Ensemble estimators ---------------------------
# Not intercepted yet: RandomForestClassifier, RandomForestRegressor.

# ------------------------------ Linear estimators ----------------------------
KernelRidge = intercept(
    "sklearn.kernel_ridge", "cuml.kernel_ridge", "KernelRidge"
)
LinearRegression = intercept(
    "sklearn.linear_model", "cuml.linear_model", "LinearRegression"
)
LogisticRegression = intercept(
    "sklearn.linear_model", "cuml.linear_model", "LogisticRegression"
)
ElasticNet = intercept(
    "sklearn.linear_model", "cuml.linear_model", "ElasticNet"
)
Ridge = intercept("sklearn.linear_model", "cuml.linear_model", "Ridge")
Lasso = intercept("sklearn.linear_model", "cuml.linear_model", "Lasso")

# ----------------------------- Manifold estimators ---------------------------
TSNE = intercept("sklearn.manifold", "cuml.manifold", "TSNE")

# ------------------------------ Bayes estimators -----------------------------
# Not intercepted yet: GaussianNB, MultinomialNB, BernoulliNB, ComplementNB.

# ---------------------------- Neighbors estimators ---------------------------
NearestNeighbors = intercept(
    "sklearn.neighbors", "cuml.neighbors", "NearestNeighbors"
)
KNeighborsClassifier = intercept(
    "sklearn.neighbors", "cuml.neighbors", "KNeighborsClassifier"
)
KNeighborsRegressor = intercept(
    "sklearn.neighbors", "cuml.neighbors", "KNeighborsRegressor"
)

# ------------------------ Random projection estimators -----------------------
# Not intercepted yet: GaussianRandomProjection, SparseRandomProjection.

# -------------------------------- SVM estimators -----------------------------
# Not intercepted yet: LinearSVC, LinearSVR, SVC, SVR.
# Proxy for ``umap.UMAP`` backed by ``cuml.manifold.UMAP``; the arguments are
# the original module, the accelerated module and the class name.
UMAP = intercept("umap", "cuml.manifold", "UMAP")
try:
    import nvtx
except ImportError:

    class nvtx:  # type: ignore
        """Do-nothing replacement used when the real ``nvtx`` is missing.

        Exposes ``push_range``, ``pop_range`` and ``annotate`` so calling
        code does not have to check whether NVTX is installed.
        """

        @staticmethod
        def push_range(*args, **kwargs):
            """Accept any arguments and do nothing."""
            return None

        @staticmethod
        def pop_range(*args, **kwargs):
            """Accept any arguments and do nothing."""
            return None

        class annotate:
            """No-op stand-in usable as a decorator or a context manager."""

            def __init__(
                self,
                message: str | None = None,
                color: str | None = None,
                domain: str | None = None,
                category: str | int | None = None,
            ):
                # All arguments are accepted for API compatibility and
                # ignored.
                pass

            def __call__(self, fn):
                # Decorator form: hand the function back unchanged.
                return fn

            def __enter__(self):
                return self

            def __exit__(self, *exc):
                # Returning False lets any exception propagate.
                return False
# Original (CPU) classes that have already been proxied, keyed by class name.
# ``intercept`` consults it so that a repeated call keeps pointing at the
# real CPU class; it is also handy when debugging.
patched_classes = {}


def intercept(
    original_module: str,
    accelerated_module: str,
    original_class_name: str,
    accelerated_class_name: Optional[str] = None,
):
    """Create a proxy estimator class that stands in for a CPU estimator.

    The returned ``ProxyEstimator`` subclasses the cuML class and keeps a
    reference to the original CPU class. As a side effect the proxy is stored
    as attribute ``original_class_name`` on the accelerated module and, when
    the ``PYTEST_CURRENT_TEST`` environment variable is set, on the original
    module as well.

    Parameters
    ----------
    original_module : str
        Import path of the CPU library module (e.g. ``"sklearn.cluster"``).
    accelerated_module : str
        Import path of the cuML module providing the accelerated class.
    original_class_name : str
        Name of the estimator class in ``original_module``.
    accelerated_class_name : str, optional
        Name of the class in ``accelerated_module``; defaults to
        ``original_class_name``.

    Returns
    -------
    type
        The newly created ``ProxyEstimator`` class.
    """

    if accelerated_class_name is None:
        accelerated_class_name = original_class_name
    # Import the original host module and cuML
    module_a = cpu_only_import(original_module)
    module_b = gpu_only_import(accelerated_module)

    # Store a reference to the original (CPU) class
    if original_class_name in patched_classes:
        original_class_a = patched_classes[original_class_name]
    else:
        original_class_a = getattr(module_a, original_class_name)
        patched_classes[original_class_name] = original_class_a

    # Get the class from cuML so ProxyEstimator inherits from it
    class_b = getattr(module_b, accelerated_class_name)

    # todo (dgd): add environment variable to disable this
    class ProxyEstimatorMeta(cuml.internals.base_helpers.BaseMetaClass):
        def __repr__(cls):
            # Make the proxy class print like the class it replaces.
            return repr(original_class_a)

    class ProxyEstimator(class_b, metaclass=ProxyEstimatorMeta):
        def __init__(self, *args, **kwargs):
            # Store a reference to the original class
            self._cpu_model_class = original_class_a
            kwargs, self._gpuaccel = self._hyperparam_translator(**kwargs)
            super().__init__(*args, **kwargs)

            self._cpu_hyperparams = list(
                inspect.signature(
                    self._cpu_model_class.__init__
                ).parameters.keys()
            )

        def __repr__(self):
            return f"wrapped {self._cpu_model_class}"

        def __str__(self):
            return f"ProxyEstimator of {self._cpu_model_class}"

        def __getstate__(self):
            # The pickled state is the ``__dict__`` of the CPU model, so make
            # sure one exists and is in sync with the GPU side first.
            if not hasattr(self, "_cpu_model"):
                self.import_cpu_model()
                self.build_cpu_model()

                self.gpu_to_cpu()

            return self._cpu_model.__dict__.copy()

        def __reduce__(self):
            # ProxyEstimator is defined inside ``intercept`` and therefore
            # cannot be located by name when unpickling. Rebuild the class
            # through ``reconstruct_proxy`` and hand the CPU-model state to
            # ``__setstate__`` via the third element of the reduce tuple.
            return (
                reconstruct_proxy,
                (original_module, accelerated_module, original_class_name),
                self.__getstate__(),
            )

        def __setstate__(self, state):
            logger.debug(f"state: {state}")
            # Store a reference to the original class
            self._cpu_model_class = original_class_a
            super().__init__()
            self.import_cpu_model()
            self._cpu_model = self._cpu_model_class()
            self._cpu_model.__dict__.update(state)
            self.cpu_to_gpu()
            self.output_type = "numpy"
            self.output_mem_type = MemoryType.host

    logger.debug(
        f"Created proxy estimator: ({module_b}, {original_class_name}, {ProxyEstimator})"
    )
    setattr(module_b, original_class_name, ProxyEstimator)

    # This is currently needed for pytest only
    if "PYTEST_CURRENT_TEST" in os.environ:
        setattr(module_a, original_class_name, ProxyEstimator)

    return ProxyEstimator


def reconstruct_proxy(
    original_module, new_module, class_name_a, args=(), kwargs=None
):
    """Rebuild a ``ProxyEstimator`` instance while unpickling.

    Needed because ``ProxyEstimator`` is created inside ``intercept`` and so
    cannot be imported by name by ``pickle``.

    Parameters
    ----------
    original_module : str
        Import path of the CPU library module.
    new_module : str
        Import path of the accelerated (cuML) module.
    class_name_a : str
        Name of the estimator class.
    args : tuple or dict, optional
        Positional arguments for the proxy constructor. A ``dict`` is treated
        as a state dictionary (the layout ``__reduce__`` used to emit in
        fourth position) and is applied through ``__setstate__``.
    kwargs : dict, optional
        Keyword arguments for the proxy constructor.

    Returns
    -------
    ProxyEstimator
        A new proxy instance.
    """
    # We probably don't need to intercept again here, since we already stored
    # the variables in _wrappers
    cls = intercept(
        original_module=original_module,
        accelerated_module=new_module,
        original_class_name=class_name_a,
    )

    if isinstance(args, dict):
        instance = cls()
        instance.__setstate__(args)
        return instance

    return cls(*args, **(kwargs or {}))
def _env_get_int(name, default):
    """Return environment variable ``name`` converted with ``int``.

    ``default`` is returned when the variable is unset and ``default`` itself
    is not convertible, or when the variable's value is not a valid integer.
    """
    raw = os.environ.get(name, default)
    try:
        parsed = int(raw)
    except (ValueError, TypeError):
        return default
    return parsed


def _env_get_bool(name, default):
    """Interpret environment variable ``name`` as a boolean.

    ``"true"``, ``"on"`` (case-insensitive, surrounding whitespace ignored)
    and non-zero integers give ``True``; ``"false"``, ``"off"`` and ``0`` give
    ``False``. An unset variable or any other value gives ``default``.
    """
    raw = os.getenv(name)
    if raw is None:
        return default

    numeric = _env_get_int(name, None)
    text = raw.lower().strip()

    if text in ("true", "on") or numeric:
        return True
    if text in ("false", "off") or numeric == 0:
        return False
    return default


def call_operator(fn, args, kwargs):
    """Call ``fn`` with positional ``args`` and keyword ``kwargs``."""
    result = fn(*args, **kwargs)
    return result


# RGB colours keyed by the NVTX range label they are used with.
_CUML_ACCEL_NVTX_COLORS = dict(
    COPY_SLOW_TO_FAST=0xCA0020,
    COPY_FAST_TO_SLOW=0xF4A582,
    EXECUTE_FAST=0x92C5DE,
    EXECUTE_SLOW=0x0571B0,
)
# ``functools.WRAPPER_ASSIGNMENTS`` minus two attributes:
# * ``__doc__`` is assigned on class creation using an exec_body callable
#   that updates the namespace of the class.
# * ``__annotations__`` is initialised differently across Python versions for
#   a class that doesn't explicitly define it, and copying it would force
#   eager evaluation of something that would normally be lazy.
_SKIPPED_WRAPPER_ATTRS = ("__annotations__", "__doc__")
_WRAPPER_ASSIGNMENTS = tuple(
    filter(
        lambda attr: attr not in _SKIPPED_WRAPPER_ATTRS,
        functools.WRAPPER_ASSIGNMENTS,
    )
)


def callers_module_name():
    """Return ``__name__`` of the module that called this function's caller."""
    # Two hops back: one for this function's own frame, one for the caller.
    here = inspect.currentframe()
    callers_caller = here.f_back.f_back
    return callers_caller.f_globals["__name__"]


class _State(IntEnum):
    """Which kind of object a final proxy currently wraps."""

    SLOW = 0
    FAST = 1


class _Unusable:
    """
    Placeholder for a "fast" object that does not exist.

    Calling an instance, reading any attribute other than ``__class__`` or
    asking for its ``repr`` raises, so operations on it fail and the
    corresponding "slow" object is used instead.
    """

    def __getattribute__(self, name: str) -> Any:
        # ``__class__`` must stay readable for type introspection.
        if name == "__class__":
            return super().__getattribute__(name)
        raise TypeError("Unusable type. Falling back to the slow object")

    def __call__(self, *args: Any, **kwds: Any) -> Any:
        raise NotImplementedError(
            "Fast implementation not available. "
            "Falling back to the slow implementation"
        )

    def __repr__(self) -> str:
        raise AttributeError("Unusable type. Falling back to the slow object")
class _PickleConstructor:
    """Pickleable factory that creates an instance without running __init__.

    Intended for use in ``__reduce__``: the stored type's ``__init__`` may
    need arguments or do initialization that is pointless when the wrapped
    object is about to be overwritten, so only ``__new__`` is invoked.
    """

    def __init__(self, type_):
        # The class to instantiate when this object is called.
        self._type = type_

    def __call__(self):
        target_type = self._type
        return object.__new__(target_type)


# Sentinel value: marks an attribute that should be removed rather than set.
_DELETE = object()
def make_final_proxy_type(
    name: str,
    fast_type: type,
    slow_type: type,
    *,
    fast_to_slow: Callable,
    slow_to_fast: Callable,
    module: str | None = None,
    additional_attributes: Mapping[str, Any] | None = None,
    postprocess: Callable[[_FinalProxy, Any, Any], Any] | None = None,
    bases: tuple = (),
    metaclasses: tuple = (),
) -> type[_FinalProxy]:
    """
    Defines a fast-slow proxy type for a pair of "final" fast and slow
    types. Final types are types for which known operations exist for
    converting an object of "fast" type to "slow" and vice-versa.

    Parameters
    ----------
    name: str
        The name of the class returned
    fast_type: type
    slow_type: type
    fast_to_slow: callable
        Function that accepts a single argument of type `fast_type`
        and returns an object of type `slow_type`
    slow_to_fast: callable
        Function that accepts a single argument of type `slow_type`
        and returns an object of type `fast_type`
    module: str, optional
        Value assigned to ``__module__`` of the returned class. When
        omitted, the name of the module calling this function is used.
    additional_attributes
        Mapping of additional attributes to add to the class
        (optional), these will override any defaulted attributes (e.g.
        ``__init__``). If you want to remove a defaulted attribute
        completely, pass the special sentinel ``_DELETE`` as a value.
    postprocess
        Optional function called to allow the proxy to postprocess
        itself when being wrapped up, called with the proxy object,
        the unwrapped result object, and the function that was used to
        construct said unwrapped object. See also `_maybe_wrap_result`.
    bases
        Optional tuple of base classes to insert into the mro.
    metaclasses
        Optional tuple of metaclasses to unify with the base proxy metaclass.

    Returns
    -------
    type
        The newly created proxy class.

    Notes
    -----
    As a side-effect, this function adds `fast_type` and `slow_type`
    to a global mapping of final types to their corresponding proxy
    types, accessible via `get_final_type_map()`.
    """
    # NOTE(review): `postprocess` is accepted and documented but is not
    # referenced anywhere in this function body - confirm whether it should
    # be stored on the class.

    def __init__(self, *args, **kwargs):
        # Construction is routed through `_fast_slow_function_call` with the
        # proxy type as the callable target; whatever that call produces is
        # stored as the wrapped object.
        _fast_slow_function_call(
            lambda cls, args, kwargs: setattr(
                self, "_fsproxy_wrapped", cls(*args, **kwargs)
            ),
            type(self),
            args,
            kwargs,
        )

    # NOTE(review): the NVTX domain below is the literal "cudf_pandas" -
    # confirm that this is the intended domain for cuML ranges.
    @nvtx.annotate(
        "COPY_SLOW_TO_FAST",
        color=_CUML_ACCEL_NVTX_COLORS["COPY_SLOW_TO_FAST"],
        domain="cudf_pandas",
    )
    def _fsproxy_slow_to_fast(self):
        # if we are wrapping a slow object,
        # convert it to a fast one
        if self._fsproxy_state is _State.SLOW:
            return slow_to_fast(self._fsproxy_wrapped)
        return self._fsproxy_wrapped

    @nvtx.annotate(
        "COPY_FAST_TO_SLOW",
        color=_CUML_ACCEL_NVTX_COLORS["COPY_FAST_TO_SLOW"],
        domain="cudf_pandas",
    )
    def _fsproxy_fast_to_slow(self):
        # if we are wrapping a fast object,
        # convert it to a slow one
        if self._fsproxy_state is _State.FAST:
            return fast_to_slow(self._fsproxy_wrapped)
        return self._fsproxy_wrapped

    @property  # type: ignore
    def _fsproxy_state(self) -> _State:
        # The state is derived on every access from the type of the wrapped
        # object rather than being stored.
        return (
            _State.FAST
            if isinstance(self._fsproxy_wrapped, self._fsproxy_fast_type)
            else _State.SLOW
        )

    # Namespace of the class being built, starting with the attributes every
    # final proxy gets.
    slow_dir = dir(slow_type)
    cls_dict = {
        "__init__": __init__,
        "__doc__": inspect.getdoc(slow_type),
        "_fsproxy_slow_dir": slow_dir,
        "_fsproxy_fast_type": fast_type,
        "_fsproxy_slow_type": slow_type,
        "_fsproxy_slow_to_fast": _fsproxy_slow_to_fast,
        "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow,
        "_fsproxy_state": _fsproxy_state,
    }

    if additional_attributes is None:
        additional_attributes = {}
    # Only special methods that the slow type actually provides are proxied.
    for method in _SPECIAL_METHODS:
        if getattr(slow_type, method, False):
            cls_dict[method] = _FastSlowAttribute(method)
    # Caller-supplied attributes override the defaults; `_DELETE` removes an
    # existing entry and is never itself stored.
    for k, v in additional_attributes.items():
        if v is _DELETE and k in cls_dict:
            del cls_dict[k]
        elif v is not _DELETE:
            cls_dict[k] = v

    # Every remaining non-dunder attribute of the slow type that has not been
    # set above becomes a `_FastSlowAttribute`, flagged private when its name
    # starts with an underscore.
    for slow_name in dir(slow_type):
        if slow_name in cls_dict or slow_name.startswith("__"):
            continue
        else:
            cls_dict[slow_name] = _FastSlowAttribute(
                slow_name, private=slow_name.startswith("_")
            )

    # When extra metaclasses are requested, derive a combined metaclass from
    # them plus the base proxy metaclass.
    metaclass = _FastSlowProxyMeta
    if metaclasses:
        metaclass = types.new_class(  # type: ignore
            f"{name}_Meta",
            metaclasses + (_FastSlowProxyMeta,),
            {},
        )
    cls = types.new_class(
        name,
        (*bases, _FinalProxy),
        {"metaclass": metaclass},
        lambda ns: ns.update(cls_dict),
    )
    # Copy the identifying metadata listed in `_WRAPPER_ASSIGNMENTS` from the
    # slow type; `updated=()` prevents `__dict__` from being merged.
    functools.update_wrapper(
        cls,
        slow_type,
        assigned=_WRAPPER_ASSIGNMENTS,
        updated=(),
    )
    cls.__module__ = module if module is not None else callers_module_name()

    # Register the new proxy for both underlying types; the `_Unusable`
    # placeholder is never registered as a fast type.
    final_type_map = get_final_type_map()
    if fast_type is not _Unusable:
        final_type_map[fast_type] = cls
    final_type_map[slow_type] = cls

    return cls
def make_intermediate_proxy_type(
    name: str,
    fast_type: type,
    slow_type: type,
    *,
    module: str | None = None,
) -> type[_IntermediateProxy]:
    """
    Defines a proxy type for a pair of "intermediate" fast and slow
    types. Intermediate types are the types of the results of
    operations invoked on final types.

    As a side-effect, this function adds `fast_type` and `slow_type`
    to a global mapping of intermediate types to their corresponding
    proxy types, accessible via `get_intermediate_type_map()`.

    Parameters
    ----------
    name: str
        The name of the class returned
    fast_type: type
    slow_type: type
    module: str, optional
        Value assigned to ``__module__`` of the returned class. When
        omitted, the name of the module calling this function is used.

    Returns
    -------
    type
        The newly created proxy class.
    """

    def __init__(self, *args, **kwargs):
        # disallow __init__. An intermediate proxy type can only be
        # instantiated from (possibly chained) operations on a final
        # proxy type.
        raise TypeError(
            f"Cannot directly instantiate object of type {type(self)}"
        )

    @property  # type: ignore
    def _fsproxy_state(self):
        # The state is derived on every access from the type of the wrapped
        # object rather than being stored.
        return (
            _State.FAST
            if isinstance(self._fsproxy_wrapped, self._fsproxy_fast_type)
            else _State.SLOW
        )

    # NOTE(review): the NVTX domain below is the literal "cudf_pandas" -
    # confirm that this is the intended domain for cuML ranges.
    # NOTE(review): `super(type(self), self)` resolves relative to the
    # runtime type of `self`; if the generated class were ever subclassed the
    # lookup would land on this same method again - confirm it never is.
    @nvtx.annotate(
        "COPY_SLOW_TO_FAST",
        color=_CUML_ACCEL_NVTX_COLORS["COPY_SLOW_TO_FAST"],
        domain="cudf_pandas",
    )
    def _fsproxy_slow_to_fast(self):
        # Conversion is delegated to the base class implementation; an object
        # that is already fast is returned unchanged.
        if self._fsproxy_state is _State.SLOW:
            return super(type(self), self)._fsproxy_slow_to_fast()
        return self._fsproxy_wrapped

    @nvtx.annotate(
        "COPY_FAST_TO_SLOW",
        color=_CUML_ACCEL_NVTX_COLORS["COPY_FAST_TO_SLOW"],
        domain="cudf_pandas",
    )
    def _fsproxy_fast_to_slow(self):
        # Mirror image of `_fsproxy_slow_to_fast`.
        if self._fsproxy_state is _State.FAST:
            return super(type(self), self)._fsproxy_fast_to_slow()
        return self._fsproxy_wrapped

    # Namespace of the class being built, starting with the attributes every
    # intermediate proxy gets.
    slow_dir = dir(slow_type)
    cls_dict = {
        "__init__": __init__,
        "__doc__": inspect.getdoc(slow_type),
        "_fsproxy_slow_dir": slow_dir,
        "_fsproxy_fast_type": fast_type,
        "_fsproxy_slow_type": slow_type,
        "_fsproxy_slow_to_fast": _fsproxy_slow_to_fast,
        "_fsproxy_fast_to_slow": _fsproxy_fast_to_slow,
        "_fsproxy_state": _fsproxy_state,
    }
    # Only special methods that the slow type actually provides are proxied.
    for method in _SPECIAL_METHODS:
        if getattr(slow_type, method, False):
            cls_dict[method] = _FastSlowAttribute(method)

    # Every remaining non-dunder attribute of the slow type becomes a
    # `_FastSlowAttribute`, flagged private when its name starts with "_".
    for slow_name in dir(slow_type):
        if slow_name in cls_dict or slow_name.startswith("__"):
            continue
        else:
            cls_dict[slow_name] = _FastSlowAttribute(
                slow_name, private=slow_name.startswith("_")
            )

    # Names listed in the slow type's optional `_attributes` collection are
    # proxied as well, unless already present.
    for slow_name in getattr(slow_type, "_attributes", []):
        if slow_name in cls_dict:
            continue
        else:
            cls_dict[slow_name] = _FastSlowAttribute(
                slow_name, private=slow_name.startswith("_")
            )

    cls = types.new_class(
        name,
        (_IntermediateProxy,),
        {"metaclass": _FastSlowProxyMeta},
        lambda ns: ns.update(cls_dict),
    )
    # Copy the identifying metadata listed in `_WRAPPER_ASSIGNMENTS` from the
    # slow type; `updated=()` prevents `__dict__` from being merged.
    functools.update_wrapper(
        cls,
        slow_type,
        assigned=_WRAPPER_ASSIGNMENTS,
        updated=(),
    )
    cls.__module__ = module if module is not None else callers_module_name()

    # Register the new proxy for both underlying types; the `_Unusable`
    # placeholder is never registered as a fast type.
    intermediate_type_map = get_intermediate_type_map()
    if fast_type is not _Unusable:
        intermediate_type_map[fast_type] = cls
    intermediate_type_map[slow_type] = cls

    return cls
else callers_module_name() + + intermediate_type_map = get_intermediate_type_map() + if fast_type is not _Unusable: + intermediate_type_map[fast_type] = cls + intermediate_type_map[slow_type] = cls + + return cls + + +def register_proxy_func(slow_func: Callable): + """ + Decorator to register custom function as a proxy for slow_func. + + Parameters + ---------- + slow_func: Callable + The function to register a wrapper for. + + Returns + ------- + Callable + """ + + def wrapper(func): + registered_functions = get_registered_functions() + registered_functions[slow_func] = func + functools.update_wrapper(func, slow_func) + return func + + return wrapper + + +@functools.lru_cache(maxsize=None) +def get_final_type_map(): + """ + Return the mapping of all known fast and slow final types to their + corresponding proxy types. + """ + return dict() + + +@functools.lru_cache(maxsize=None) +def get_intermediate_type_map(): + """ + Return a mapping of all known fast and slow intermediate types to their + corresponding proxy types. + """ + return dict() + + +@functools.lru_cache(maxsize=None) +def get_registered_functions(): + return dict() + + +def _raise_attribute_error(obj, name): + """ + Raise an AttributeError with a message that is consistent with + the error raised by Python for a non-existent attribute on a + proxy object. + """ + raise AttributeError(f"'{obj}' object has no attribute '{name}'") + + +class _FastSlowProxyMeta(type): + """ + Metaclass used to dynamically find class attributes and + classmethods of fast-slow proxy types. + """ + + _fsproxy_slow_dir: list + _fsproxy_slow_type: type + _fsproxy_fast_type: type + + @property + def _fsproxy_slow(self) -> type: + return self._fsproxy_slow_type + + @property + def _fsproxy_fast(self) -> type: + return self._fsproxy_fast_type + + def __dir__(self): + # Try to return the cached dir of the slow object, but if it + # doesn't exist, fall back to the default implementation. 
class _FastSlowProxy:
    """
    Base class for all fast-slow proxy types.

    A fast-slow proxy stands in for a pair of types that provide
    "fast" and "slow" implementations of the same API. At any time it
    wraps an object of either the "fast" type or the "slow" type.
    Operations invoked on the proxy are first delegated to the "fast"
    type and, if that fails, to the "slow" type.
    """

    # The object currently wrapped by the proxy (fast or slow).
    _fsproxy_wrapped: Any

    def _fsproxy_fast_to_slow(self) -> Any:
        """
        Return the "slow" counterpart of the wrapped object, or the
        wrapped object itself if it is already of "slow" type.

        Must be implemented by subclasses.
        """
        raise NotImplementedError("Abstract base class")

    def _fsproxy_slow_to_fast(self) -> Any:
        """
        Return the "fast" counterpart of the wrapped object, or the
        wrapped object itself if it is already of "fast" type.

        Must be implemented by subclasses.
        """
        raise NotImplementedError("Abstract base class")

    @property
    def _fsproxy_fast(self) -> Any:
        """
        The wrapped object as a "fast" object.

        The result of `_fsproxy_slow_to_fast()` replaces the wrapped
        object before it is returned.
        """
        self._fsproxy_wrapped = self._fsproxy_slow_to_fast()
        return self._fsproxy_wrapped

    @property
    def _fsproxy_slow(self) -> Any:
        """
        The wrapped object as a "slow" object.

        The result of `_fsproxy_fast_to_slow()` replaces the wrapped
        object before it is returned.
        """
        self._fsproxy_wrapped = self._fsproxy_fast_to_slow()
        return self._fsproxy_wrapped

    def __dir__(self):
        # Prefer the cached dir of the slow object; without one, use
        # the default implementation.
        try:
            cached_dir = self._fsproxy_slow_dir
        except AttributeError:
            return object.__dir__(self)
        else:
            return cached_dir

    def __setattr__(self, name, value):
        # Public names are routed through the fast-slow machinery;
        # underscore-prefixed names are stored on the proxy itself.
        if not name.startswith("_"):
            setter = _FastSlowAttribute("__setattr__").__get__(
                self, type(self)
            )
            return setter(name, value)
        object.__setattr__(self, name, value)
class _IntermediateProxy(_FastSlowProxy):
    """
    Proxy type for a pair of "intermediate" types, i.e. values that
    show up as intermediate results of operations on "final" types.

    Conversion between fast and slow is done by remembering the call
    that produced the wrapped object and replaying that call with its
    arguments converted to the requested (fast or slow) kind.

    Do not attempt to use this class directly. Instead, use
    `make_intermediate_proxy_type` to create subtypes.
    """

    # (func, args, kwargs) of the call that produced the wrapped object.
    _method_chain: tuple[Callable, tuple, dict]

    @classmethod
    def _fsproxy_wrap(
        cls,
        obj: Any,
        method_chain: tuple[Callable, tuple, dict],
    ):
        """
        Create a proxy around `obj` without running `__init__`.

        Parameters
        ----------
        obj: The object to wrap
        method_chain: A tuple of the form (func, args, kwargs) where
            `func` is the function that was called to create `obj`,
            and `args` and `kwargs` are the arguments that were passed
            to `func`.
        """
        instance = object.__new__(cls)
        instance._fsproxy_wrapped = obj
        instance._method_chain = method_chain
        return instance

    @nvtx.annotate(
        "COPY_SLOW_TO_FAST",
        color=_CUML_ACCEL_NVTX_COLORS["COPY_SLOW_TO_FAST"],
        domain="cudf_pandas",
    )
    def _fsproxy_slow_to_fast(self) -> Any:
        # Replay the recorded call with fast arguments.
        func, call_args, call_kwargs = self._method_chain
        return func(*_fast_arg(call_args), **_fast_arg(call_kwargs))

    @nvtx.annotate(
        "COPY_FAST_TO_SLOW",
        color=_CUML_ACCEL_NVTX_COLORS["COPY_FAST_TO_SLOW"],
        domain="cudf_pandas",
    )
    def _fsproxy_fast_to_slow(self) -> Any:
        # Replay the recorded call with slow arguments.
        func, call_args, call_kwargs = self._method_chain
        return func(*_slow_arg(call_args), **_slow_arg(call_kwargs))

    def __reduce__(self):
        """
        Together with `__setstate__`, make the proxy picklable by
        pickling the wrapped object and the method chain.
        """
        # Need a local import to avoid circular import issues
        from .module_accelerator import disable_module_accelerator

        with disable_module_accelerator():
            wrapped_bytes = pickle.dumps(self._fsproxy_wrapped)
            chain_bytes = pickle.dumps(self._method_chain)
        state = (wrapped_bytes, chain_bytes)
        return (_PickleConstructor(type(self)), (), state)

    def __setstate__(self, state):
        """
        Restore the wrapped object and the method chain from the
        two-element state produced by `__reduce__`.
        """
        # Need a local import to avoid circular import issues
        from .module_accelerator import disable_module_accelerator

        with disable_module_accelerator():
            wrapped = pickle.loads(state[0])
            chain = pickle.loads(state[1])
        self._fsproxy_wrapped = wrapped
        self._method_chain = chain
class _FunctionProxy(_CallableProxyMixin):
    """
    Proxy for a pair of fast and slow functions.
    """

    __name__: str

    def __init__(
        self,
        fast: Callable | _Unusable,
        slow: Callable,
        *,
        assigned=None,
        updated=None,
    ):
        """
        Parameters
        ----------
        fast
            The fast function (or `_Unusable`)
        slow
            The slow function; its metadata is copied onto the proxy
        assigned, updated
            Forwarded to `functools.update_wrapper`. `None` selects
            `functools.WRAPPER_ASSIGNMENTS` and
            `functools.WRAPPER_UPDATES` respectively.
        """
        self._fsproxy_fast = fast
        self._fsproxy_slow = slow
        functools.update_wrapper(
            self,
            slow,
            assigned=(
                functools.WRAPPER_ASSIGNMENTS if assigned is None else assigned
            ),
            updated=functools.WRAPPER_UPDATES if updated is None else updated,
        )

    def __reduce__(self):
        """
        Together with `__setstate__`, make the proxy picklable by
        pickling the fast and the slow function.
        """
        # Need a local import to avoid circular import issues
        from .module_accelerator import disable_module_accelerator

        with disable_module_accelerator():
            fast_bytes = pickle.dumps(self._fsproxy_fast)
            slow_bytes = pickle.dumps(self._fsproxy_slow)
        state = (fast_bytes, slow_bytes)
        return (_PickleConstructor(type(self)), (), state)

    def __setstate__(self, state):
        """
        Restore the fast and slow functions from the two-element state
        produced by `__reduce__`.
        """
        # Need a local import to avoid circular import issues
        from .module_accelerator import disable_module_accelerator

        with disable_module_accelerator():
            fast = pickle.loads(state[0])
            slow = pickle.loads(state[1])
        self._fsproxy_fast = fast
        self._fsproxy_slow = slow
class _MethodProxy(_FunctionProxy):
    """
    Function proxy used for methods.

    `__name__`, `__doc__` and `dir()` are answered by the slow
    function instead of being copied onto the proxy.
    """

    def __init__(self, fast, slow):
        # `__name__` is served by the property below, so it is left
        # out of the attributes copied from `slow`.
        copied = tuple(
            attr for attr in _WRAPPER_ASSIGNMENTS if attr != "__name__"
        )
        super().__init__(
            fast,
            slow,
            updated=functools.WRAPPER_UPDATES,
            assigned=copied,
        )

    def __dir__(self):
        return self._fsproxy_slow.__dir__()

    @property
    def __doc__(self):
        return self._fsproxy_slow.__doc__

    @property
    def __name__(self):
        return self._fsproxy_slow.__name__

    @__name__.setter
    def __name__(self, value):
        # Renaming the fast function is best effort; the slow function
        # is always renamed.
        try:
            self._fsproxy_fast.__name__ = value
        except AttributeError:
            pass
        self._fsproxy_slow.__name__ = value
def _transform_arg(
    arg: Any,
    attribute_name: Literal["_fsproxy_slow", "_fsproxy_fast"],
    seen: set[int],
) -> Any:
    """
    Transform "arg" into its corresponding slow (or fast) type.

    Proxies (and modules carrying `attribute_name` in their
    `__dict__`) are replaced by the object stored under
    `attribute_name`. Lists, tuples, dicts and object-dtype numpy
    arrays are rebuilt with their elements transformed recursively.
    Plain Python functions get their closure variables transformed.
    Anything else is returned unchanged.

    Parameters
    ----------
    arg
        The value to transform
    attribute_name
        "_fsproxy_fast" to obtain fast objects, "_fsproxy_slow" to
        obtain slow objects
    seen
        ids of the functions already visited; guards against infinite
        recursion on mutually recursive functions

    Raises
    ------
    Exception
        If a proxy has no usable object for the requested kind, or if
        `arg` is an iterator and the fast kind was requested.
    NotImplementedError
        For tuple subclasses using `__getnewargs_ex__` in a way that
        cannot be reconstructed.
    """
    import numpy as np

    if isinstance(arg, (_FastSlowProxy, _FastSlowProxyMeta, _FunctionProxy)):
        typ = getattr(arg, attribute_name)
        if typ is _Unusable:
            raise Exception("Cannot transform _Unusable")
        return typ
    elif isinstance(arg, types.ModuleType) and attribute_name in arg.__dict__:
        return arg.__dict__[attribute_name]
    elif isinstance(arg, list):
        return type(arg)(_transform_arg(a, attribute_name, seen) for a in arg)
    elif isinstance(arg, tuple):
        # This attempts to handle arbitrary subclasses of tuple by
        # assuming that if you've subclassed tuple with some special
        # behaviour you'll also make the object pickleable by
        # implementing the custom pickle protocol interface (either
        # __getnewargs_ex__ or __getnewargs__). Perhaps this should
        # use __reduce_ex__ instead...
        if type(arg) is tuple:
            # Must come first to avoid infinite recursion
            return tuple(_transform_arg(a, attribute_name, seen) for a in arg)
        elif hasattr(arg, "__getnewargs_ex__"):
            # Partial implementation of the pickle protocol: rebuild
            # the object from its transformed constructor arguments.
            # This handles scipy._lib._bunch._make_tuple_bunch
            args, kwargs = (
                _transform_arg(a, attribute_name, seen)
                for a in arg.__getnewargs_ex__()
            )
            obj = type(arg).__new__(type(arg), *args, **kwargs)
            if hasattr(obj, "__setstate__"):
                raise NotImplementedError(
                    "Transforming tuple-like with __getnewargs_ex__ and "
                    "__setstate__ not implemented"
                )
            if not hasattr(obj, "__dict__") and kwargs:
                raise NotImplementedError(
                    "Transforming tuple-like with kwargs from "
                    "__getnewargs_ex__ and no __dict__ not implemented"
                )
            if kwargs:
                # Only touch __dict__ when there is something to
                # store: objects without a __dict__ and without kwargs
                # are valid and must not fail here.
                obj.__dict__.update(kwargs)
            return obj
        elif hasattr(arg, "__getnewargs__"):
            # This handles namedtuple, and would catch tuple if we
            # didn't handle it above.
            args = _transform_arg(arg.__getnewargs__(), attribute_name, seen)
            return type(arg).__new__(type(arg), *args)
        else:
            # Hope we can just call the constructor with transformed
            # entries. Iterate over `arg` itself: no `args` exists in
            # this branch.
            return type(arg)(
                _transform_arg(a, attribute_name, seen) for a in arg
            )
    elif isinstance(arg, dict):
        return {
            _transform_arg(k, attribute_name, seen): _transform_arg(
                a, attribute_name, seen
            )
            for k, a in arg.items()
        }
    elif isinstance(arg, np.ndarray) and arg.dtype == "O":
        transformed = [
            _transform_arg(a, attribute_name, seen) for a in arg.flat
        ]
        # Keep the same memory layout as arg (the default is C_CONTIGUOUS)
        if arg.flags["F_CONTIGUOUS"] and not arg.flags["C_CONTIGUOUS"]:
            order = "F"
        else:
            order = "C"
        result = np.empty(int(np.prod(arg.shape)), dtype=object, order=order)
        result[...] = transformed
        return result.reshape(arg.shape)
    elif isinstance(arg, Iterator) and attribute_name == "_fsproxy_fast":
        # this may include consumable objects like generators or
        # IOBase objects, which we don't want unavailable to the slow
        # path in case of fallback. So, we raise here and ensure the
        # slow path is taken:
        raise Exception()
    elif isinstance(arg, types.FunctionType):
        if id(arg) in seen:
            # `arg` is mutually recursive with another function. We
            # can't handle these cases yet:
            return arg
        seen.add(id(arg))
        return _replace_closurevars(arg, attribute_name, seen)
    else:
        return arg
def _is_function_or_method(obj: Any) -> bool:
    """
    Tell whether `obj` is a function or method of any flavour.

    Python-level and builtin functions, bound methods and the various
    descriptor/wrapper types are recognised by type. As a fallback,
    objects whose type name contains "cython_function_or_method" are
    accepted too.
    """
    callable_kinds = (
        types.FunctionType,
        types.BuiltinFunctionType,
        types.MethodType,
        types.WrapperDescriptorType,
        types.MethodWrapperType,
        types.MethodDescriptorType,
        types.BuiltinMethodType,
    )
    if isinstance(obj, callable_kinds):
        return True

    # Fallback: match on the printed name of the type.
    try:
        type_repr = str(type(obj))
    except Exception:
        return False
    return "cython_function_or_method" in type_repr
# The scalar types known to numpy (the values of `np.sctypeDict`).
NUMPY_TYPES: set[type] = set(np.sctypeDict.values())


# Names of the special ("dunder") methods that proxy types forward to
# the wrapped object. Currently empty. This must be a real set:
# `{}` would create a dict.
_SPECIAL_METHODS: set[str] = set()
# Two definitions of `load_ipython_extension` exist; which one is used
# depends on whether the IPython import below succeeds.
try:
    # The imported names are only referenced by the commented-out
    # magics class below.
    from IPython.core.magic import Magics, cell_magic, magics_class

    # Profiling magics, disabled together with the profiler import:
    #
    # from .profiler import Profiler, lines_with_profiling

    # @magics_class
    # class CumlAccelMagic(Magics):
    #     @cell_magic("cuml.accelerator.profile")
    #     def profile(self, _, cell):
    #         with Profiler() as profiler:
    #             get_ipython().run_cell(cell)  # noqa: F821
    #         profiler.print_per_function_stats()

    #     @cell_magic("cuml.accelerator.line_profile")
    #     def line_profile(self, _, cell):
    #         new_cell = lines_with_profiling(cell.split("\n"))
    #         get_ipython().run_cell(new_cell)  # noqa: F821

    def load_ipython_extension(ip):
        """
        Call `install()` from the enclosing package.

        `ip` is not used by the current implementation.
        """
        # Imported here rather than at module level.
        # NOTE(review): presumably to avoid a circular import with the
        # package `__init__` — confirm.
        from . import install

        install()
        # ip.register_magics(CumlAccelMagic)

except ImportError:

    def load_ipython_extension(ip):
        """Do nothing: the IPython import above failed."""
        pass
def rename_root_module(module: str, root: str, new_root: str) -> str:
    """
    Rename a module to a new root.

    Only `root` itself and its dotted submodules are renamed. A module
    that merely shares a textual prefix with `root` (for example
    "sklearnex" for the root "sklearn") belongs to a different package
    and is returned unchanged.

    Parameters
    ----------
    module
        Module to rename
    root
        Original root
    new_root
        New root

    Returns
    -------
    New module name (if it matches root) otherwise original name.

    Examples
    --------
    >>> rename_root_module("sklearn.cluster", "sklearn", "cuml")
    'cuml.cluster'
    >>> rename_root_module("sklearnex.cluster", "sklearn", "cuml")
    'sklearnex.cluster'
    """
    # Match on a module-path boundary, not on a bare string prefix.
    if module == root or module.startswith(f"{root}."):
        return new_root + module[len(root) :]
    return module
+ """ + if "CUML_FALLBACK_MODE" not in os.environ: + try: + importlib.import_module(fast_lib) + return DeducedMode( + use_fast_lib=True, slow_lib=slow_lib, fast_lib=fast_lib + ) + except Exception as e: + warnings.warn( + f"Exception encountered importing {fast_lib}: {e}." + f"Falling back to only using {slow_lib}." + ) + return DeducedMode( + use_fast_lib=False, slow_lib=slow_lib, fast_lib=slow_lib + ) + + +class ModuleAcceleratorBase( + importlib.abc.MetaPathFinder, importlib.abc.Loader +): + _instance: ModuleAcceleratorBase | None = None + mod_name: str + fast_lib: str + slow_lib: str + + # When walking the module tree and wrapping module attributes, + # we often will come across the same object more than once. We + # don't want to create separate wrappers for each + # instance, so we keep a registry of all module attributes + # that we can look up to see if we have already wrapped an + # attribute before + _wrapped_objs: dict[Any, Any] + + def __new__( + cls, + mod_name: str, + fast_lib: str, + slow_lib: str, + ): + """Build a custom module finder that will provide wrapped modules + on demand. + + Parameters + ---------- + mod_name + Import name to deliver modules under. + fast_lib + Name of package that provides "fast" implementation + slow_lib + Name of package that provides "slow" fallback implementation + """ + # todo (dgd) replace this check for raising only when initializing + # a loader for an already module-accelerated slow_lib + # if ModuleAcceleratorBase._instance is not None: + # raise RuntimeError( + # "Only one instance of ModuleAcceleratorBase allowed" + # ) + self = object.__new__(cls) + self.mod_name = mod_name + self.fast_lib = fast_lib + self.slow_lib = slow_lib + + # When walking the module tree and wrapping module attributes, + # we often will come across the same object more than once. 
We + # don't want to create separate wrappers for each + # instance, so we keep a registry of all module attributes + # that we can look up to see if we have already wrapped an + # attribute before + self._wrapped_objs = {} + self._wrapped_objs.update(get_final_type_map()) + self._wrapped_objs.update(get_intermediate_type_map()) + self._wrapped_objs.update(get_registered_functions()) + + ModuleAcceleratorBase._instance = self + return self + + def __repr__(self) -> str: + return ( + f"{self.__class__.__name__}" + f"(fast={self.fast_lib}, slow={self.slow_lib})" + ) + + def find_spec( + self, fullname: str, path, target=None + ) -> importlib.machinery.ModuleSpec | None: + """Provide ourselves as a module loader. + + Parameters + ---------- + fullname + Name of module to be imported, if it starts with the name + that we are using to wrap, we will deliver ourselves as a + loader, otherwise defer to the standard Python loaders. + + Returns + ------- + A ModuleSpec with ourself as loader if we're interposing, + otherwise None to pass off to the next loader. + """ + if fullname == self.mod_name or fullname.startswith( + f"{self.mod_name}." + ): + return importlib.machinery.ModuleSpec( + name=fullname, + loader=self, + # Note, this influences the repr of the module, so we may want + # to change it if we ever want to control that. + origin=None, + loader_state=None, + is_package=True, + ) + return None + + def create_module(self, spec) -> ModuleType | None: + return None + + def exec_module(self, mod: ModuleType): + # importlib calls this function with the global import lock held. + self._populate_module(mod) + + @abstractmethod + def disabled(self) -> ContextManager: + pass + + def _postprocess_module( + self, + mod: ModuleType, + slow_mod: ModuleType, + fast_mod: ModuleType | None, + ) -> ModuleType: + """Ensure that the wrapped module satisfies required invariants. 
+ + Parameters + ---------- + mod + Wrapped module to postprocess + slow_mod + Slow version that we are mimicking + fast_mod + Fast module that provides accelerated implementations (may + be None + + Returns + ------- + Checked and validated module + + Notes + ----- + The implementation of fast-slow proxies imposes certain + requirements on the wrapped modules that it delivers. This + function encodes those requirements and raises if the module + does not satisfy them. + + This post-processing routine should be kept up to date with any + requirements encoded by fast_slow_proxy.py + """ + mod.__dict__["_fsproxy_slow"] = slow_mod + if fast_mod is not None: + mod.__dict__["_fsproxy_fast"] = fast_mod + return mod + + @abstractmethod + def _populate_module(self, mod: ModuleType) -> ModuleType: + """Populate given module with appropriate attributes. + + This traverses the attributes of the slow module corresponding + to mod and mirrors those in the provided module in a wrapped + mode that attempts to execute them using the fast module first. + + Parameters + ---------- + mod + Module to populate + + Returns + ------- + ModuleType + Populated module + + Notes + ----- + In addition to the attributes of the slow module, + the returned module must have the following attributes: + + - '_fsproxy_slow': the corresponding slow module + - '_fsproxy_fast': the corresponding fast module + + This is necessary for correct rewriting of UDFs when calling + to the respective fast/slow libraries. + + The necessary invariants are checked and applied in + :meth:`_postprocess_module`. + """ + pass + + def _wrap_attribute( + self, + slow_attr: Any, + fast_attr: Any | _Unusable, + name: str, + ) -> Any: + """ + Return the wrapped version of an attribute. 
+ + Parameters + ---------- + slow_attr : Any + The attribute from the slow module + fast_attr : Any (or _Unusable) + The same attribute from the fast module, if it exists + name + Name of attribute + + Returns + ------- + Wrapped attribute + """ + wrapped_attr: Any + # TODO: what else should we make sure not to get from the fast + # library? + if name in {"__all__", "__dir__", "__file__", "__doc__"}: + wrapped_attr = slow_attr + elif self.fast_lib == self.slow_lib: + # no need to create a fast-slow wrapper + wrapped_attr = slow_attr + if any( + [ + slow_attr in get_registered_functions(), + slow_attr in get_final_type_map(), + slow_attr in get_intermediate_type_map(), + ] + ): + # attribute already registered in self._wrapped_objs + return self._wrapped_objs[slow_attr] + if isinstance(slow_attr, ModuleType) and slow_attr.__name__.startswith( + self.slow_lib + ): + # attribute is a submodule of the slow library, + # replace the string "{slow_lib}" in the submodule's + # name with "{self.mod_name}" + # now, attempt to import the wrapped module, which will + # recursively wrap all of its attributes: + return importlib.import_module( + rename_root_module( + slow_attr.__name__, self.slow_lib, self.mod_name + ) + ) + if slow_attr in self._wrapped_objs: + if type(fast_attr) is _Unusable: + # we don't want to replace a wrapped object that + # has a usable fast object with a wrapped object + # with an unusable fast object. 
+ return self._wrapped_objs[slow_attr] + if name in wrapped_estimators: + + mod = importlib.import_module(wrapped_estimators[name][0]) + wrapped_attr = getattr(mod, wrapped_estimators[name][1]) + logger.debug(f"Patched {wrapped_attr}") + # elif _is_function_or_method(slow_attr): + # wrapped_attr = _FunctionProxy(fast_attr, slow_attr) + else: + wrapped_attr = slow_attr + return wrapped_attr + + @classmethod + @abstractmethod + def install( + cls, destination_module: str, fast_lib: str, slow_lib: str + ) -> Self | None: + """ + Install the loader in sys.meta_path. + + Parameters + ---------- + destination_module + Name under which the importer will kick in + fast_lib + Name of fast module + slow_lib + Name of slow module we are trying to mimic + + Returns + ------- + Instance of the class (or None if the loader was not installed) + + Notes + ----- + This function is idempotent. If called with the same arguments + a second time, it does not create a new loader, but instead + returns the existing loader from ``sys.meta_path``. + + """ + pass + + +class ModuleAccelerator(ModuleAcceleratorBase): + """ + A finder and loader that produces "accelerated" modules. + + When someone attempts to import the specified slow library with + this finder enabled, we intercept the import and deliver an + equivalent, accelerated, version of the module. This provides + attributes and modules that check if they are being used from + "within" the slow (or fast) library themselves. If this is the + case, the implementation is forwarded to the actual slow library + implementation, otherwise a proxy implementation is used (which + attempts to call the fast version first). 
+ """ + + _denylist: tuple[str] + _use_fast_lib: bool + _use_fast_lib_lock: threading.RLock + _module_cache_prefix: str = "_slow_lib_" + + # TODO: Add possibility for either an explicit allow-list of + # libraries where the slow_lib should be wrapped, or, more likely + # a block-list that adds to the set of libraries where no proxying occurs. + def __new__( + cls, + fast_lib, + slow_lib, + ): + self = super().__new__( + cls, + slow_lib, + fast_lib, + slow_lib, + ) + # Import the real versions of the modules so that we can + # rewrite the sys.modules cache. + slow_module = importlib.import_module(slow_lib) + fast_module = importlib.import_module(fast_lib) + # Note, this is not thread safe, but install() below grabs the + # lock for the whole initialisation and modification of + # sys.meta_path. + for mod in sys.modules.copy(): + if mod.startswith(self.slow_lib): + sys.modules[self._module_cache_prefix + mod] = sys.modules[mod] + del sys.modules[mod] + self._denylist = (*slow_module.__path__, *fast_module.__path__) + + # Lock to manage temporarily disabling delivering wrapped attributes + self._use_fast_lib_lock = threading.RLock() + self._use_fast_lib = True + return self + + def _populate_module(self, mod: ModuleType): + mod_name = mod.__name__ + + # Here we attempt to import "_fsproxy_slow_lib.x.y.z", but + # "_fsproxy_slow_lib" does not exist anywhere as a real file, so + # how does this work? + # The importer attempts to import ".z" by first importing + # "_fsproxy_slow_lib.x.y", this recurses until we find + # "_fsproxy_slow_lib.x" (say), which does exist because we set that up + # in __init__. Now the importer looks at the __path__ + # attribute of "x" and uses that to find the relative location + # to look for "y". This __path__ points to the real location + # of "slow_lib.x". So, as long as we rewire the _already imported_ + # slow_lib modules in sys.modules to _fsproxy_slow_lib, when we + # get here this will find the right thing. 
+ # The above exposition is for lazily imported submodules (e.g. + # avoiding circular imports by putting an import at function + # level). For everything that is eagerly imported when we do + # "import slow_lib" this import line is trivial because we + # immediately pull the correct result out of sys.modules. + + # mod_name, + # self.slow_lib, + # self._module_cache_prefix + self.slow_lib, + # )}") + slow_mod = importlib.import_module( + rename_root_module( + mod_name, + self.slow_lib, + self._module_cache_prefix + self.slow_lib, + ) + ) + try: + fast_mod = importlib.import_module( + rename_root_module(mod_name, self.slow_lib, self.fast_lib) + ) + except Exception: + fast_mod = None + + # The version that will be used if called within a denylist + # package + real_attributes = {} + # The version that will be used outside denylist packages + for key in slow_mod.__dir__(): + with warnings.catch_warnings(): + warnings.simplefilter("ignore", FutureWarning) + slow_attr = getattr(slow_mod, key) + fast_attr = getattr(fast_mod, key, _Unusable()) + real_attributes[key] = slow_attr + try: + wrapped_attr = self._wrap_attribute(slow_attr, fast_attr, key) + self._wrapped_objs[slow_attr] = wrapped_attr + except TypeError: + # slow_attr is not hashable + pass + + # Our module has (basically) no static attributes and instead + # always delivers them dynamically where the behaviour is + # dependent on the calling module. + setattr( + mod, + "__getattr__", + functools.partial( + self.getattr_real_or_wrapped, + real=real_attributes, + wrapped_objs=self._wrapped_objs, + loader=self, + ), + ) + + # ...but, we want to pretend like we expose the same attributes + # as the equivalent slow module + setattr(mod, "__dir__", slow_mod.__dir__) + + # We set __path__ to the real path so that importers like + # jinja2.PackageLoader("slow_mod") work correctly. 
+ # Note (dgd): this doesn't work for resources.files(data_module) + if getattr(slow_mod, "__path__", False): + assert mod.__spec__ + mod.__path__ = slow_mod.__path__ + mod.__spec__.submodule_search_locations = [*slow_mod.__path__] + return self._postprocess_module(mod, slow_mod, fast_mod) + + @contextlib.contextmanager + def disabled(self): + """Return a context manager for disabling the module accelerator. + + Within the block, any wrapped objects will instead deliver + attributes from their real counterparts (as if the current + nested block were in the denylist). + + Returns + ------- + Context manager for disabling things + """ + try: + self._use_fast_lib_lock.acquire() + # The same thread might enter this context manager + # multiple times, so we need to remember the previous + # value + saved = self._use_fast_lib + self._use_fast_lib = False + yield + finally: + self._use_fast_lib = saved + self._use_fast_lib_lock.release() + + @staticmethod + def getattr_real_or_wrapped( + name: str, + *, + real: dict[str, Any], + wrapped_objs, + loader: ModuleAccelerator, + ) -> Any: + """ + Obtain an attribute from a module from either the real or + wrapped namespace. + + Parameters + ---------- + name + Attribute to return + real + Unwrapped "original" attributes + wrapped + Wrapped attributes + loader + Loader object that manages denylist and other skipping + + Returns + ------- + The requested attribute (either real or wrapped) + """ + with loader._use_fast_lib_lock: + # Have to hold the lock to read this variable since + # another thread might modify it. + # Modification has to happen with the lock held for the + # duration, so if someone else has modified things, then + # we block trying to acquire the lock (hence it is safe to + # release the lock after reading this value) + use_real = not loader._use_fast_lib + if not use_real: + # Only need to check the denylist if we're not turned off. + frame = sys._getframe() + # We cannot possibly be at the top level. 
+ assert frame.f_back + calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) + use_real = _caller_in_denylist( + calling_module, tuple(loader._denylist) + ) + try: + if use_real: + return real[name] + else: + return wrapped_objs[real[name]] + except KeyError: + raise AttributeError(f"No attribute '{name}'") + except TypeError: + # real[name] is an unhashable type + return real[name] + + @classmethod + def install( + cls, + destination_module: str, + fast_lib: str, + slow_lib: str, + ) -> Self | None: + # This grabs the global _import_ lock to avoid concurrent + # threads modifying sys.modules. + # We also make sure that we finish installing ourselves in + # sys.meta_path before releasing the lock so that there isn't + # a race between our modification of sys.modules and someone + # else importing the slow_lib before we have added ourselves + # to the meta_path + with ImportLock(): + logger.debug("Module Accelerator Install") + logger.debug(f"destination_module: {destination_module}") + logger.debug(f"fast_lib: {fast_lib}") + logger.debug(f"slow_lib: {slow_lib}") + logger.info("Non Estimator Function Dispatching disabled...") + if destination_module != slow_lib: + raise RuntimeError( + f"Destination module '{destination_module}' must match" + f"'{slow_lib}' for this to work." + ) + mode = deduce_cuml_accel_mode(slow_lib, fast_lib) + if mode.use_fast_lib: + importlib.import_module( + f".._wrappers.{mode.slow_lib}", __name__ + ) + try: + (self,) = ( + p + for p in sys.meta_path + if isinstance(p, cls) + and p.slow_lib == mode.slow_lib + and p.fast_lib == mode.fast_lib + ) + except ValueError: + self = cls(mode.fast_lib, mode.slow_lib) + sys.meta_path.insert(0, self) + return self + + +def disable_module_accelerator() -> contextlib.ExitStack: + """ + Temporarily disable any module acceleration. 
+ """ + with contextlib.ExitStack() as stack: + for finder in sys.meta_path: + if isinstance(finder, ModuleAcceleratorBase): + stack.enter_context(finder.disabled()) + return stack.pop_all() + assert False # pacify type checker + + +# because this function gets called so often and is quite +# expensive to run, we cache the results: +@functools.lru_cache(maxsize=1024) +def _caller_in_denylist(calling_module, denylist): + CUML_ACCELERATOR_PATH = __file__.rsplit("/", 1)[0] + return not calling_module.is_relative_to(CUML_ACCELERATOR_PATH) and any( + calling_module.is_relative_to(path) for path in denylist + ) diff --git a/python/cuml/cuml/internals/base.pyx b/python/cuml/cuml/internals/base.pyx index 9813acbba4..a361381968 100644 --- a/python/cuml/cuml/internals/base.pyx +++ b/python/cuml/cuml/internals/base.pyx @@ -202,6 +202,12 @@ class Base(TagsMixin, del base # optional! """ + _base_hyperparam_interop_translator = { + "n_jobs": "accept" + } + + _hyperparam_interop_translator = {} + def __init__(self, *, handle=None, verbose=False, @@ -471,6 +477,45 @@ class Base(TagsMixin, func = nvtx_annotate(message=msg, domain="cuml_python")(func) setattr(self, func_name, func) + @classmethod + def _hyperparam_translator(cls, **kwargs): + """ + This method is meant to do checks and translations of hyperparameters + at estimator creating time. 
+ Each child estimator can override the method. Returns the translated + kwargs plus a flag that is False if any value maps to 'dispatch'. + """ + gpu_hyperparams = cls._get_param_names() + kwargs.pop("self", None) + gpuaccel = True + for arg, value in kwargs.items(): + + if arg in cls._base_hyperparam_interop_translator: + if cls._base_hyperparam_interop_translator[arg] == "accept": + gpuaccel = gpuaccel and True + + elif arg in cls._hyperparam_interop_translator: + if value in cls._hyperparam_interop_translator[arg]: + if cls._hyperparam_interop_translator[arg][value] == "accept": + gpuaccel = gpuaccel and True + elif cls._hyperparam_interop_translator[arg][value] == "dispatch": + gpuaccel = False + else: + kwargs[arg] = cls._hyperparam_interop_translator[arg][value] + gpuaccel = gpuaccel and True + # todo (dgd): improve message + logger.warn("Value changed") + + else: + gpuaccel = gpuaccel and True + + # else: + # gpuaccel = False + + # we need to enable this if we enable translation for regular cuML + # kwargs["_gpuaccel"] = gpuaccel + return kwargs, gpuaccel + # Internal, non class owned helper functions def _check_output_type_str(output_str): @@ -681,11 +726,15 @@ class UniversalBase(Base): keyword arguments to be passed to the function for the call """ # look for current device_type - device_type = cuml.global_settings.device_type + # device_type = cuml.global_settings.device_type + device_type = self._dispatch_selector(func_name, *args, **kwargs) + + logger.debug(f"device_type {device_type}") # GPU case - if device_type == DeviceType.device: + if device_type == DeviceType.device or func_name not in ['fit', 'fit_transform', 'fit_predict']: # call the function from the GPU estimator + logger.debug(f"Performing {func_name} in GPU") return gpu_func(self, *args, **kwargs) # CPU case @@ -725,3 +774,32 @@ class UniversalBase(Base): # return function result return res + + def _dispatch_selector(self, func_name, *args, **kwargs): + """Return the DeviceType on which to run func_name. + """ + if not hasattr(self, "_gpuaccel"): + return 
cuml.global_settings.device_type + + elif not self._gpuaccel: + device_type = DeviceType.host + else: + if not self._should_dispatch_cpu(func_name, *args, **kwargs): + device_type = DeviceType.device + else: + device_type = DeviceType.host + + return device_type + + def _should_dispatch_cpu(self, func_name, *args, **kwargs): + """ + This method is meant to do checks of data sizes and other things + at fit and other method call time, to decide where to dispatch + a function. For hyperparameters of the estimator, + see the method _hyperparam_translator. + Each estimator inheriting from UniversalBase can override this + method to have custom rules of when to dispatch to CPU depending + on the data passed to fit/predict... + """ + + return False diff --git a/python/cuml/cuml/linear_model/elastic_net.pyx b/python/cuml/cuml/linear_model/elastic_net.pyx index a8e6b75a3d..435778adad 100644 --- a/python/cuml/cuml/linear_model/elastic_net.pyx +++ b/python/cuml/cuml/linear_model/elastic_net.pyx @@ -150,6 +150,17 @@ class ElasticNet(UniversalBase, _cpu_estimator_import_path = 'sklearn.linear_model.ElasticNet' coef_ = CumlArrayDescriptor(order='F') + _hyperparam_interop_translator = { + "positive": { + True: "dispatch", + False: "accept", + }, + "warm_start": { + True: "dispatch", + False: "accept", + }, + } + @device_interop_preparation def __init__(self, *, alpha=1.0, l1_ratio=0.5, fit_intercept=True, normalize=False, max_iter=1000, tol=1e-3, diff --git a/python/cuml/cuml/linear_model/linear_regression.pyx b/python/cuml/cuml/linear_model/linear_regression.pyx index 35a73c111f..3a53a915db 100644 --- a/python/cuml/cuml/linear_model/linear_regression.pyx +++ b/python/cuml/cuml/linear_model/linear_regression.pyx @@ -268,6 +268,13 @@ class LinearRegression(LinearPredictMixin, coef_ = CumlArrayDescriptor(order='F') intercept_ = CumlArrayDescriptor(order='F') + _hyperparam_interop_translator = { + "positive": { + True: "dispatch", + False: "accept", + }, + } + 
@device_interop_preparation def __init__(self, *, algorithm='eig', fit_intercept=True, copy_X=None, normalize=False, diff --git a/python/cuml/cuml/linear_model/logistic_regression.pyx b/python/cuml/cuml/linear_model/logistic_regression.pyx index aa5283fef7..c9ad443750 100644 --- a/python/cuml/cuml/linear_model/logistic_regression.pyx +++ b/python/cuml/cuml/linear_model/logistic_regression.pyx @@ -189,6 +189,17 @@ class LogisticRegression(UniversalBase, class_weight = CumlArrayDescriptor(order='F') expl_spec_weights_ = CumlArrayDescriptor(order='F') + _hyperparam_interop_translator = { + "solver": { + "lbfgs": "qn", + "liblinear": "qn", + "newton-cg": "qn", + "newton-cholesky": "qn", + "sag": "qn", + "saga": "qn" + }, + } + @device_interop_preparation def __init__( self, diff --git a/python/cuml/cuml/linear_model/ridge.pyx b/python/cuml/cuml/linear_model/ridge.pyx index ae84f1002a..daa91ae172 100644 --- a/python/cuml/cuml/linear_model/ridge.pyx +++ b/python/cuml/cuml/linear_model/ridge.pyx @@ -192,6 +192,18 @@ class Ridge(UniversalBase, coef_ = CumlArrayDescriptor(order='F') intercept_ = CumlArrayDescriptor(order='F') + _hyperparam_interop_translator = { + "solver": { + "auto": "eig", + "cholesky": "eig", + "lsqr": "eig", + "sag": "eig", + "saga": "eig", + "lbfgs": "eig", + "sparse_cg": "eig" + } + } + @device_interop_preparation def __init__(self, *, alpha=1.0, solver='eig', fit_intercept=True, normalize=False, handle=None, output_type=None, diff --git a/python/cuml/cuml/manifold/umap.pyx b/python/cuml/cuml/manifold/umap.pyx index c873461a95..7000850872 100644 --- a/python/cuml/cuml/manifold/umap.pyx +++ b/python/cuml/cuml/manifold/umap.pyx @@ -234,7 +234,7 @@ class UMAP(UniversalBase, are returned when transform is called on the same data upon which the model was trained. This enables consistent behavior between calling ``model.fit_transform(X)`` and - calling ``model.fit(X).transform(X)``. Not that the CPU-based + calling ``model.fit(X).transform(X)``. 
Note that the CPU-based UMAP reference implementation does this by default. This feature is made optional in the GPU version due to the significant overhead in copying memory to the host for diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_dbscan.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_dbscan.py new file mode 100644 index 0000000000..af4503d3ed --- /dev/null +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_dbscan.py @@ -0,0 +1,91 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest +import numpy as np +from sklearn.datasets import make_blobs +from sklearn.cluster import DBSCAN +from sklearn.metrics import adjusted_rand_score + + +@pytest.fixture(scope="module") +def clustering_data(): + X, y = make_blobs( + n_samples=300, + centers=3, + cluster_std=[1.0, 2.5, 0.5], + random_state=42, + ) + return X, y + + +@pytest.mark.parametrize("eps", [0.1, 0.5, 1.0, 2.0]) +def test_dbscan_eps(clustering_data, eps): + X, y_true = clustering_data + dbscan = DBSCAN(eps=eps).fit(X) + y_pred = dbscan.labels_ + ari = adjusted_rand_score(y_true, y_pred) + + +@pytest.mark.parametrize("min_samples", [1, 5, 10, 20]) +def test_dbscan_min_samples(clustering_data, min_samples): + X, y_true = clustering_data + dbscan = DBSCAN(eps=0.5, min_samples=min_samples).fit(X) + y_pred = dbscan.labels_ + ari = adjusted_rand_score(y_true, y_pred) + + +@pytest.mark.parametrize("metric", ["euclidean", "manhattan", "chebyshev"]) +def test_dbscan_metric(clustering_data, metric): + X, y_true = clustering_data + dbscan = DBSCAN(eps=0.5, metric=metric).fit(X) + y_pred = dbscan.labels_ + ari = adjusted_rand_score(y_true, y_pred) + + +@pytest.mark.parametrize( + "algorithm", ["auto", "ball_tree", "kd_tree", "brute"] +) +def test_dbscan_algorithm(clustering_data, algorithm): + X, y_true = clustering_data + dbscan = DBSCAN(eps=0.5, algorithm=algorithm).fit(X) + y_pred = dbscan.labels_ + ari = adjusted_rand_score(y_true, y_pred) + + +@pytest.mark.parametrize("leaf_size", [10, 30, 50]) +def test_dbscan_leaf_size(clustering_data, leaf_size): + X, y_true = clustering_data + dbscan = DBSCAN(eps=0.5, leaf_size=leaf_size).fit(X) + y_pred = dbscan.labels_ + ari = adjusted_rand_score(y_true, y_pred) + + +@pytest.mark.parametrize("p", [1, 2, 3]) +def test_dbscan_p(clustering_data, p): + X, y_true = clustering_data + dbscan = DBSCAN(eps=0.5, metric="minkowski", p=p).fit(X) + y_pred = dbscan.labels_ + ari = adjusted_rand_score(y_true, y_pred) + + +def 
test_dbscan_consistency(clustering_data): + X, y_true = clustering_data + dbscan1 = DBSCAN(eps=0.5).fit(X) + dbscan2 = DBSCAN(eps=0.5).fit(X) + assert np.array_equal( + dbscan1.labels_, dbscan2.labels_ + ), "Results should be consistent across runs" diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_elastic_net.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_elastic_net.py new file mode 100644 index 0000000000..4caf8d0022 --- /dev/null +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_elastic_net.py @@ -0,0 +1,215 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest +import numpy as np +from sklearn.datasets import make_regression +from sklearn.linear_model import ElasticNet +from sklearn.metrics import mean_squared_error, r2_score +from sklearn.preprocessing import StandardScaler + + +@pytest.fixture(scope="module") +def regression_data(): + X, y = make_regression( + n_samples=500, + n_features=20, + n_informative=10, + noise=0.1, + random_state=42, + ) + # Standardize features + X = StandardScaler().fit_transform(X) + return X, y + + +@pytest.mark.parametrize("alpha", [0.1, 0.5, 1.0, 2.0]) +def test_elasticnet_alpha(regression_data, alpha): + X, y = regression_data + model = ElasticNet(alpha=alpha, random_state=42) + model.fit(X, y) + y_pred = model.predict(X) + # Compute R^2 score + r2 = r2_score(y, y_pred) + assert r2 > 0.5, f"R^2 score should be reasonable for alpha={alpha}" + + +@pytest.mark.parametrize("l1_ratio", [0.0, 0.5, 0.7, 1.0]) +def test_elasticnet_l1_ratio(regression_data, l1_ratio): + X, y = regression_data + model = ElasticNet(alpha=1.0, l1_ratio=l1_ratio, random_state=42) + model.fit(X, y) + y_pred = model.predict(X) + # Compute R^2 score + r2 = r2_score(y, y_pred) + assert r2 > 0.5, f"R^2 score should be reasonable for l1_ratio={l1_ratio}" + # Check sparsity of coefficients when l1_ratio=1 (equivalent to Lasso) + if l1_ratio == 1.0: + num_nonzero = np.sum(model.coef_ != 0) + assert ( + num_nonzero < X.shape[1] + ), "Some coefficients should be zero when l1_ratio=1.0" + + +@pytest.mark.parametrize("max_iter", [100]) +def test_elasticnet_max_iter(regression_data, max_iter): + X, y = regression_data + model = ElasticNet(max_iter=max_iter, random_state=42) + model.fit(X, y) + + +@pytest.mark.parametrize("tol", [1e-3]) +def test_elasticnet_tol(regression_data, tol): + X, y = regression_data + model = ElasticNet(tol=tol, random_state=42) + model.fit(X, y) + y_pred = model.predict(X) + # Compute R^2 score + r2 = r2_score(y, y_pred) + assert r2 > 0.5, f"R^2 score should be reasonable for 
tol={tol}" + + +@pytest.mark.parametrize("fit_intercept", [True, False]) +def test_elasticnet_fit_intercept(regression_data, fit_intercept): + X, y = regression_data + model = ElasticNet(fit_intercept=fit_intercept, random_state=42) + model.fit(X, y) + y_pred = model.predict(X) + # Compute R^2 score + r2 = r2_score(y, y_pred) + assert ( + r2 > 0.5 + ), f"R^2 score should be reasonable with fit_intercept={fit_intercept}" + + +@pytest.mark.parametrize("precompute", [True, False]) +def test_elasticnet_precompute(regression_data, precompute): + X, y = regression_data + model = ElasticNet(precompute=precompute, random_state=42) + model.fit(X, y) + y_pred = model.predict(X) + # Compute R^2 score + r2 = r2_score(y, y_pred) + assert ( + r2 > 0.5 + ), f"R^2 score should be reasonable with precompute={precompute}" + + +@pytest.mark.parametrize("selection", ["cyclic", "random"]) +def test_elasticnet_selection(regression_data, selection): + X, y = regression_data + model = ElasticNet(selection=selection, random_state=42) + model.fit(X, y) + y_pred = model.predict(X) + # Compute R^2 score + r2 = r2_score(y, y_pred) + assert ( + r2 > 0.5 + ), f"R^2 score should be reasonable with selection={selection}" + + +def test_elasticnet_random_state(regression_data): + X, y = regression_data + model1 = ElasticNet(selection="random", random_state=42) + model1.fit(X, y) + model2 = ElasticNet(selection="random", random_state=42) + model2.fit(X, y) + # Coefficients should be the same when random_state is fixed + np.testing.assert_allclose( + model1.coef_, + model2.coef_, + err_msg="Coefficients should be the same with the same random_state", + ) + model3 = ElasticNet(selection="random", random_state=24) + model3.fit(X, y) + # Coefficients might differ with a different random_state + # with pytest.raises(AssertionError): + # np.testing.assert_allclose( + # model1.coef_, + # model3.coef_, + # err_msg="Coefficients should differ with different random_state", + # ) + + +def 
test_elasticnet_convergence_warning(regression_data): + X, y = regression_data + from sklearn.exceptions import ConvergenceWarning + + with pytest.warns(ConvergenceWarning): + model = ElasticNet(max_iter=1, random_state=42) + model.fit(X, y) + + +def test_elasticnet_coefficients(regression_data): + X, y = regression_data + model = ElasticNet(alpha=0.1, l1_ratio=0.5, random_state=42) + model.fit(X, y) + coef_nonzero = np.sum(model.coef_ != 0) + assert coef_nonzero > 0, "There should be non-zero coefficients" + + +def test_elasticnet_l1_ratio_effect(regression_data): + X, y = regression_data + model_l1 = ElasticNet(alpha=0.1, l1_ratio=1.0, random_state=42) + model_l1.fit(X, y) + model_l2 = ElasticNet(alpha=0.1, l1_ratio=0.0, random_state=42) + model_l2.fit(X, y) + num_nonzero_l1 = np.sum(model_l1.coef_ != 0) + num_nonzero_l2 = np.sum(model_l2.coef_ != 0) + assert ( + num_nonzero_l1 <= num_nonzero_l2 + ), "L1 regularization should produce sparser coefficients than L2" + + +@pytest.mark.parametrize("copy_X", [True, False]) +def test_elasticnet_copy_X(regression_data, copy_X): + X, y = regression_data + X_original = X.copy() + model = ElasticNet(copy_X=copy_X, random_state=42) + model.fit(X, y) + if copy_X: + # X should remain unchanged + assert np.allclose( + X, X_original + ), "X has been modified when copy_X=True" + else: + # X might be modified when copy_X=False + pass # We cannot guarantee X remains unchanged + + +def test_elasticnet_positive(regression_data): + X, y = regression_data + model = ElasticNet(positive=True, random_state=42) + model.fit(X, y) + # All coefficients should be non-negative + assert np.all( + model.coef_ >= 0 + ), "All coefficients should be non-negative when positive=True" + + +def test_elasticnet_warm_start(regression_data): + X, y = regression_data + model = ElasticNet(warm_start=True, random_state=42) + model.fit(X, y) + coef_old = model.coef_.copy() + # Fit again with more iterations + model.set_params(max_iter=2000) + model.fit(X, y) + 
coef_new = model.coef_ + # Coefficients should change after more iterations + assert not np.allclose( + coef_old, coef_new + ), "Coefficients should update when warm_start=True" diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_hdbscan_core.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_hdbscan_core.py new file mode 100644 index 0000000000..8b19f5a9d1 --- /dev/null +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_hdbscan_core.py @@ -0,0 +1,328 @@ +# +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +import pytest +import numpy as np +from sklearn.datasets import make_blobs, make_moons +from sklearn.preprocessing import StandardScaler +import hdbscan + + +@pytest.fixture(scope="module") +def synthetic_data(): + X, y = make_blobs( + n_samples=500, + n_features=2, + centers=5, + cluster_std=0.5, + random_state=42, + ) + # Standardize features + X = StandardScaler().fit_transform(X) + return X, y + + +@pytest.mark.parametrize("min_cluster_size", [5, 15, 30]) +def test_hdbscan_min_cluster_size(synthetic_data, min_cluster_size): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size) + cluster_labels = clusterer.fit_predict(X) + # Check that clusters are formed + n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0) + assert ( + n_clusters > 0 + ), f"Should find clusters with min_cluster_size={min_cluster_size}" + + +@pytest.mark.parametrize("min_samples", [1, 5, 15]) +def test_hdbscan_min_samples(synthetic_data, min_samples): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN(min_samples=min_samples) + cluster_labels = clusterer.fit_predict(X) + # Check that clusters are formed + n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0) + assert ( + n_clusters > 0 + ), f"Should find clusters with min_samples={min_samples}" + + +@pytest.mark.parametrize( + "metric", ["euclidean", "manhattan", "chebyshev", "minkowski"] +) +def test_hdbscan_metric(synthetic_data, metric): + X, _ = synthetic_data + p = 0.5 if metric == "minkowski" else None + clusterer = hdbscan.HDBSCAN(metric=metric, p=p) + cluster_labels = clusterer.fit_predict(X) + # Check that clusters are formed + n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0) + assert n_clusters > 0, f"Should find clusters with metric={metric}" + + +@pytest.mark.parametrize("method", ["eom", "leaf"]) +def test_hdbscan_cluster_selection_method(synthetic_data, method): + X, _ = synthetic_data + clusterer = 
hdbscan.HDBSCAN(cluster_selection_method=method) + cluster_labels = clusterer.fit_predict(X) + # Check that clusters are formed + n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0) + assert ( + n_clusters > 0 + ), f"Should find clusters with cluster_selection_method={method}" + + +def test_hdbscan_prediction_data(synthetic_data): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN(prediction_data=True) + clusterer.fit(X) + # Check that prediction data is available + assert hasattr( + clusterer, "prediction_data_" + ), "Prediction data should be available when prediction_data=True" + + +@pytest.mark.parametrize("algorithm", ["best", "generic"]) +def test_hdbscan_algorithm(synthetic_data, algorithm): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN(algorithm=algorithm) + cluster_labels = clusterer.fit_predict(X) + # Check that clusters are formed + n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0) + assert n_clusters > 0, f"Should find clusters with algorithm={algorithm}" + + +@pytest.mark.parametrize("leaf_size", [10, 30, 50]) +def test_hdbscan_leaf_size(synthetic_data, leaf_size): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN(leaf_size=leaf_size) + cluster_labels = clusterer.fit_predict(X) + # Check that clusters are formed + n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0) + assert n_clusters > 0, f"Should find clusters with leaf_size={leaf_size}" + + +def test_hdbscan_gen_min_span_tree(synthetic_data): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN(gen_min_span_tree=True) + clusterer.fit(X) + # Check that the minimum spanning tree is generated + assert hasattr( + clusterer, "minimum_spanning_tree_" + ), "Minimum spanning tree should be generated when gen_min_span_tree=True" + + +def test_hdbscan_memory(synthetic_data, tmpdir): + X, _ = synthetic_data + from joblib import Memory + + memory = Memory(location=tmpdir) + clusterer = hdbscan.HDBSCAN(memory=memory) + 
clusterer.fit(X) + # Check that cache directory is used + # assert tmpdir.listdir(), "Cache directory should not be empty when memory caching is used" + + +def test_hdbscan_approx_min_span_tree(synthetic_data): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN(approx_min_span_tree=True) + clusterer.fit(X) + # Check that the parameter is set correctly + assert ( + clusterer.approx_min_span_tree is True + ), "approx_min_span_tree should be set to True" + + +@pytest.mark.parametrize("n_jobs", [1, -1]) +def test_hdbscan_core_dist_n_jobs(synthetic_data, n_jobs): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN(core_dist_n_jobs=n_jobs) + clusterer.fit(X) + # We assume the code runs without error; no direct way to test n_jobs effect + assert True, f"HDBSCAN ran successfully with core_dist_n_jobs={n_jobs}" + + +def test_hdbscan_probabilities(synthetic_data): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN() + clusterer.fit(X) + # Check that cluster membership probabilities are available + assert hasattr( + clusterer, "probabilities_" + ), "Cluster membership probabilities should be available after fitting" + + +def test_hdbscan_outlier_scores(synthetic_data): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN() + clusterer.fit(X) + # Check that outlier scores are available + assert hasattr( + clusterer, "outlier_scores_" + ), "Outlier scores should be available after fitting" + + +def test_hdbscan_fit_predict(synthetic_data): + X, _ = synthetic_data + clusterer = hdbscan.HDBSCAN() + labels_fit = clusterer.fit(X).labels_ + labels_predict = clusterer.fit_predict(X) + # Check that labels from fit and fit_predict are the same + assert np.array_equal( + labels_fit, labels_predict + ), "Labels from fit and fit_predict should be the same" + + +def test_hdbscan_invalid_metric(synthetic_data): + X, _ = synthetic_data + with pytest.raises(ValueError): + clusterer = hdbscan.HDBSCAN(metric="invalid_metric") + clusterer.fit(X) + + +def test_hdbscan_sparse_input(): + 
def test_hdbscan_non_convex_shapes():
    """Cluster the two-moons dataset and require at least two clusters."""
    # The class labels returned by make_moons are not needed here.
    X, _ = make_moons(n_samples=300, noise=0.05, random_state=42)
    clusterer = hdbscan.HDBSCAN(min_cluster_size=5)
    cluster_labels = clusterer.fit_predict(X)
    # Check that at least two clusters are found (-1 is not counted)
    n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
    assert n_clusters >= 2, "Should find clusters in non-convex shapes"
def test_hdbscan_min_span_tree_effect(synthetic_data):
    """Check that ``gen_min_span_tree`` does not change the cluster labels.

    Two estimators are fitted on the same data, one with
    ``gen_min_span_tree=True`` and one with ``gen_min_span_tree=False``,
    and their ``labels_`` must be identical.
    """
    X, _ = synthetic_data
    clusterer_with_tree = hdbscan.HDBSCAN(gen_min_span_tree=True)
    clusterer_with_tree.fit(X)
    clusterer_without_tree = hdbscan.HDBSCAN(gen_min_span_tree=False)
    clusterer_without_tree.fit(X)
    # Generating the minimum spanning tree must not alter the labelling.
    assert np.array_equal(
        clusterer_with_tree.labels_, clusterer_without_tree.labels_
    ), "Clustering should be consistent regardless of gen_min_span_tree"
def test_hdbscan_membership_vector(synthetic_data):
    """Compute the membership vector of a single training point.

    The estimator is fitted with ``prediction_data=True`` and queried with
    the first training sample reshaped to a ``(1, 2)`` array.
    """
    X_train, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN(prediction_data=True)
    clusterer.fit(X_train)
    point = X_train[0].reshape((1, 2))
    membership = hdbscan.membership_vector(clusterer, point)
    # The result was previously discarded; require one row per query point.
    assert (
        len(membership) == point.shape[0]
    ), "There should be one membership vector per query point"
def test_hdbscan_condensed_tree(synthetic_data):
    """Check that the condensed tree can be exported to a pandas DataFrame."""
    X, _ = synthetic_data
    clusterer = hdbscan.HDBSCAN()
    clusterer.fit(X)
    condensed_tree = clusterer.condensed_tree_
    # Check that the condensed tree has the expected attributes
    assert hasattr(
        condensed_tree, "to_pandas"
    ), "Condensed tree should have a 'to_pandas' method"
    # Convert to pandas DataFrame and check columns
    df = condensed_tree.to_pandas()
    expected_columns = {"parent", "child", "lambda_val", "child_size"}
    assert expected_columns.issubset(
        df.columns
    ), "Condensed tree DataFrame should expose the documented columns"
def test_hdbscan_outlier_exposure(synthetic_data):
    """Check that outlier scores, when present, are all finite.

    hdbscan may not provide a function called 'outlier_exposure'; this test
    stands in for outlier detection functionality and is skipped when the
    fitted estimator has no ``outlier_scores_`` attribute.
    """
    features, _ = synthetic_data
    model = hdbscan.HDBSCAN()
    model.fit(features)
    # Bail out early when the attribute is missing.
    if not hasattr(model, "outlier_scores_"):
        pytest.skip(
            "Outlier exposure functionality is not available in this version of HDBSCAN"
        )
    scores = model.outlier_scores_
    all_finite = np.all(np.isfinite(scores))
    assert all_finite, "Outlier scores should be finite numbers"
def test_hdbscan_get_exemplars(synthetic_data):
    """Check that there is one exemplar entry per cluster found.

    The test is skipped when the fitted estimator has no ``exemplars_``
    attribute.
    """
    features, _ = synthetic_data
    model = hdbscan.HDBSCAN()
    model.fit(features)
    # Bail out early when exemplars are not supported.
    if not hasattr(model, "exemplars_"):
        pytest.skip(
            "Exemplar functionality is not available in this version of HDBSCAN"
        )
    found_exemplars = model.exemplars_
    # Count distinct labels, leaving out -1 when it is present.
    distinct_labels = set(model.labels_)
    noise_offset = 1 if -1 in model.labels_ else 0
    expected = len(distinct_labels) - noise_offset
    assert (
        len(found_exemplars) == expected
    ), "There should be exemplars for each cluster"
@pytest.mark.parametrize("tol", [1e-4, 1e-3, 1e-2])
def test_kmeans_tol(clustering_data, tol):
    """Fit KMeans with each ``tol`` and check agreement with the true labels.

    The fixture is generated with three blob centres, matching
    ``n_clusters=3``, so the adjusted Rand index against the generating
    labels is used as the quality measure.
    """
    X, y_true = clustering_data
    kmeans = KMeans(n_clusters=3, tol=tol, random_state=42).fit(X)
    y_pred = kmeans.labels_
    ari = adjusted_rand_score(y_true, y_pred)
    # The score was previously computed and discarded; assert on it.
    assert ari > 0.9, f"Clustering should match the true blobs with tol={tol}"
def test_kmeans_random_state(clustering_data):
    """Check that a fixed ``random_state`` makes KMeans reproducible.

    Only reproducibility under the same seed is asserted. A different seed
    may converge to the same centres, possibly in a different order, so no
    inequality between seeds is required.
    """
    X, y_true = clustering_data
    kmeans1 = KMeans(n_clusters=3, random_state=42).fit(X)
    kmeans2 = KMeans(n_clusters=3, random_state=42).fit(X)
    # With the same random_state, results should be the same
    assert np.allclose(kmeans1.cluster_centers_, kmeans2.cluster_centers_)
    kmeans3 = KMeans(n_clusters=3, random_state=24).fit(X)
    # With a different random_state the centres might or might not differ,
    # so only check that a solution of the same shape is produced.
    assert (
        kmeans3.cluster_centers_.shape == kmeans1.cluster_centers_.shape
    ), "A different random_state should still yield the same number of centers"
@pytest.mark.parametrize("weights", ["uniform", "distance"])
def test_knn_classifier_weights(classification_data, weights):
    """Fit with each ``weights`` option and check training accuracy."""
    features, targets = classification_data
    classifier = KNeighborsClassifier(weights=weights)
    classifier.fit(features, targets)
    predictions = classifier.predict(features)
    # Accuracy is measured on the same data the model was fitted on.
    score = accuracy_score(targets, predictions)
    assert score > 0.7, f"Accuracy should be reasonable with weights={weights}"
def test_knn_classifier_weights_callable(classification_data):
    """Fit with a callable ``weights`` and check training accuracy."""
    features, targets = classification_data

    def custom_weights(distances):
        # Give every neighbour the same weight, whatever its distance.
        return np.ones_like(distances)

    classifier = KNeighborsClassifier(weights=custom_weights)
    classifier.fit(features, targets)
    predictions = classifier.predict(features)
    score = accuracy_score(targets, predictions)
    assert score > 0.7, "Accuracy should be reasonable with custom weights"
def test_knn_classifier_multilabel():
    """Fit on a multi-label problem and check prediction shape and accuracy."""
    from sklearn.datasets import make_multilabel_classification

    features, targets = make_multilabel_classification(
        n_samples=100, n_features=20, n_classes=3, random_state=42
    )
    classifier = KNeighborsClassifier()
    classifier.fit(features, targets)
    predictions = classifier.predict(features)
    # Predictions must have the same layout as the label matrix.
    same_shape = predictions.shape == targets.shape
    assert (
        same_shape
    ), "Predicted labels should have the same shape as true labels"
    # Element-wise agreement, averaged over every sample/label pair.
    agreement = (predictions == targets).mean()
    assert (
        agreement > 0.7
    ), "Accuracy should be reasonable for multi-label classification"
@pytest.mark.parametrize("n_neighbors", [1, 3, 5, 10])
def test_knn_regressor_n_neighbors(regression_data, n_neighbors):
    """Fit and predict with each ``n_neighbors`` and sanity-check the output.

    No minimum R^2 is enforced for this parameter sweep; the test requires
    one prediction per sample and a finite score.
    """
    X, y = regression_data
    model = KNeighborsRegressor(n_neighbors=n_neighbors)
    model.fit(X, y)
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    # The score was previously computed and discarded; check the results.
    assert len(y_pred) == len(
        y
    ), f"One prediction per sample expected with n_neighbors={n_neighbors}"
    assert np.isfinite(
        r2
    ), f"R^2 score should be finite with n_neighbors={n_neighbors}"
@pytest.mark.parametrize(
    "metric", ["euclidean", "manhattan", "chebyshev", "minkowski"]
)
def test_knn_regressor_metric(regression_data, metric):
    """Fit and predict with each distance ``metric`` and sanity-check output.

    No minimum R^2 is enforced for this parameter sweep; the test requires
    one prediction per sample and a finite score.
    """
    X, y = regression_data
    model = KNeighborsRegressor(metric=metric)
    model.fit(X, y)
    y_pred = model.predict(X)
    r2 = r2_score(y, y_pred)
    # The score was previously computed and discarded; check the results.
    assert len(y_pred) == len(
        y
    ), f"One prediction per sample expected with metric={metric}"
    assert np.isfinite(r2), f"R^2 score should be finite with metric={metric}"
def test_knn_regressor_multioutput():
    """Fit on a three-target regression problem and check the predictions."""
    X, y = make_regression(
        n_samples=100, n_features=20, n_targets=3, random_state=42
    )
    model = KNeighborsRegressor()
    model.fit(X, y)
    y_pred = model.predict(X)
    # Check that the predicted shape matches the true targets
    assert (
        y_pred.shape == y.shape
    ), "Predicted outputs should have the same shape as true outputs"
    # Calculate R^2 score for multi-output regression
    r2 = r2_score(y, y_pred)
    # The score was previously discarded; require at least a finite value.
    assert np.isfinite(
        r2
    ), "R^2 score should be finite for multi-output regression"
def test_lasso_alpha_sparsity(regression_data):
    """Check that the zero-coefficient count does not drop as alpha grows."""
    features, targets, _ = regression_data

    def count_zero_coefficients(alpha):
        # Fit one model at this penalty and count exactly-zero coefficients.
        estimator = Lasso(alpha=alpha, random_state=42)
        estimator.fit(features, targets)
        return np.sum(estimator.coef_ == 0)

    zero_counts = [
        count_zero_coefficients(alpha) for alpha in [0.1, 1.0, 10.0, 100.0]
    ]
    # Alphas are visited in increasing order, so the counts must already
    # be in ascending order.
    ascending = sorted(zero_counts)
    assert (
        zero_counts == ascending
    ), "Number of zero coefficients should increase with alpha"
def test_lasso_random_state(regression_data):
    """Check that a fixed ``random_state`` makes random selection repeatable.

    Only reproducibility under the same seed is asserted. Coefficients
    obtained with a different seed might or might not differ, so no
    inequality between seeds is required.
    """
    X, y, _ = regression_data
    model1 = Lasso(selection="random", random_state=42)
    model1.fit(X, y)
    model2 = Lasso(selection="random", random_state=42)
    model2.fit(X, y)
    # Coefficients should be the same when random_state is fixed
    np.testing.assert_allclose(
        model1.coef_,
        model2.coef_,
        err_msg="Coefficients should be the same with the same random_state",
    )
    model3 = Lasso(selection="random", random_state=24)
    model3.fit(X, y)
    # A different seed must still give a usable solution; whether it
    # differs from the first one is not guaranteed.
    assert (
        model3.coef_.shape == model1.coef_.shape
    ), "Coefficient shape should not depend on random_state"
    assert np.all(
        np.isfinite(model3.coef_)
    ), "Coefficients should be finite with a different random_state"
copy_X=False + pass # We cannot guarantee X remains unchanged + + +def test_lasso_convergence_warning(regression_data): + X, y, _ = regression_data + from sklearn.exceptions import ConvergenceWarning + + with pytest.warns(ConvergenceWarning): + model = Lasso(max_iter=1, random_state=42) + model.fit(X, y) + + +def test_lasso_coefficients_sparsity(regression_data): + X, y, _ = regression_data + model = Lasso(alpha=1.0, random_state=42) + model.fit(X, y) + coef_zero = np.sum(model.coef_ == 0) + assert ( + coef_zero > 0 + ), "There should be zero coefficients indicating sparsity" + + +@pytest.mark.parametrize("selection", ["cyclic", "random"]) +def test_lasso_selection(regression_data, selection): + X, y, _ = regression_data + model = Lasso(selection=selection, random_state=42) + model.fit(X, y) + y_pred = model.predict(X) + r2 = r2_score(y, y_pred) + assert ( + r2 > 0.5 + ), f"R^2 score should be reasonable with selection={selection}" + + +@pytest.mark.parametrize("precompute", [True, False]) +def test_lasso_precompute(regression_data, precompute): + X, y, _ = regression_data + model = Lasso(precompute=precompute, random_state=42) + model.fit(X, y) + y_pred = model.predict(X) + r2 = r2_score(y, y_pred) + assert ( + r2 > 0.5 + ), f"R^2 score should be reasonable with precompute={precompute}" diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_linear_regression.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_linear_regression.py new file mode 100644 index 0000000000..34bc2c0358 --- /dev/null +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_linear_regression.py @@ -0,0 +1,59 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score


@pytest.fixture(scope="module")
def regression_data():
    """Return the ``(X, y)`` regression problem shared by this module."""
    X, y = make_regression(
        n_samples=100, n_features=20, noise=0.1, random_state=42
    )
    return X, y


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_linear_regression_fit_intercept(regression_data, fit_intercept):
    """Ordinary least squares fits the data with and without an intercept."""
    X, y = regression_data
    lr = LinearRegression(fit_intercept=fit_intercept).fit(X, y)
    y_pred = lr.predict(X)
    assert (
        y_pred.shape == y.shape
    ), f"One prediction per sample is expected (fit_intercept={fit_intercept})"
    # The target is a linear function of X plus small noise (noise=0.1) and
    # make_regression's default bias is zero, so both settings should explain
    # nearly all of the variance on the training data.
    assert (
        r2_score(y, y_pred) > 0.9
    ), f"Training R^2 should be high with fit_intercept={fit_intercept}"


@pytest.mark.parametrize("copy_X", [True, False])
def test_linear_regression_copy_X(regression_data, copy_X):
    """With ``copy_X=True`` the caller's ``X`` must be left untouched."""
    X, y = regression_data
    X_original = X.copy()
    LinearRegression(copy_X=copy_X).fit(X, y)
    if copy_X:
        # X should remain unchanged
        assert np.array_equal(
            X, X_original
        ), "X has been modified when copy_X=True"
    # With copy_X=False the estimator is allowed to overwrite X, so there is
    # nothing to assert; the fit only has to complete.


@pytest.mark.parametrize("positive", [True, False])
def test_linear_regression_positive(regression_data, positive):
    """``positive=True`` must yield only non-negative coefficients."""
    X, y = regression_data
    lr = LinearRegression(positive=positive).fit(X, y)
    y_pred = lr.predict(X)
    assert (
        y_pred.shape == y.shape
    ), f"One prediction per sample is expected (positive={positive})"
    if positive:
        # Verify that all coefficients are non-negative
        assert np.all(lr.coef_ >= 0), "Not all coefficients are non-negative"


# --- patch text that shares this source line, kept verbatim as a comment ---
# diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_logistic_regression.py
# --- patch text that shares these source lines, kept verbatim as comments ---
# b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_logistic_regression.py
# new file mode 100644
# index 0000000000..d0c93d2c30
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_logistic_regression.py
# @@ -0,0 +1,195 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


@pytest.fixture(scope="module")
def classification_data():
    """Return the three-class ``(X, y)`` problem shared by this module."""
    X, y = make_classification(
        n_samples=200,
        n_features=20,
        n_classes=3,
        n_informative=10,
        random_state=42,
    )
    return X, y


def _assert_valid_predictions(clf, X, y):
    """Predict on the training data and sanity-check the result.

    Checks that there is one prediction per sample, that every predicted
    label is one of the training labels, and that training accuracy is
    better than uniform guessing over the classes present in ``y``.

    Parameters
    ----------
    clf : fitted classifier exposing ``predict``
    X : training features the classifier was fitted on
    y : training labels the classifier was fitted on
    """
    y_pred = clf.predict(X)
    assert y_pred.shape == y.shape, "One prediction per sample is expected"
    classes = np.unique(y)
    assert np.isin(
        y_pred, classes
    ).all(), "Predicted labels must be drawn from the training labels"
    acc = accuracy_score(y, y_pred)
    assert (
        acc > 1.0 / classes.size
    ), f"Training accuracy {acc:.3f} is not better than uniform guessing"


@pytest.mark.parametrize(
    "penalty, solver",
    [
        ("l1", "liblinear"),
        ("l1", "saga"),
        ("l2", "lbfgs"),
        ("l2", "liblinear"),
        ("l2", "sag"),
        ("l2", "saga"),
        ("elasticnet", "saga"),
        (None, "lbfgs"),
        (None, "saga"),
    ],
)
def test_logistic_regression_penalty(classification_data, penalty, solver):
    """Each listed penalty/solver pairing fits and predicts sensibly."""
    X, y = classification_data
    kwargs = {"penalty": penalty, "solver": solver, "max_iter": 200}
    if penalty == "elasticnet":
        kwargs["l1_ratio"] = 0.5  # l1_ratio is required for elasticnet
    clf = LogisticRegression(**kwargs).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("dual", [True, False])
def test_logistic_regression_dual(classification_data, dual):
    """Both the dual and the primal liblinear formulation fit and predict."""
    X, y = classification_data
    # 'dual' is only applicable when 'penalty' is 'l2' and 'solver' is
    # 'liblinear', so those two are pinned for both parameter values.
    clf = LogisticRegression(
        penalty="l2", solver="liblinear", dual=dual, max_iter=200
    ).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("tol", [1e-2])
def test_logistic_regression_tol(classification_data, tol):
    """A loose stopping tolerance still gives sensible predictions."""
    X, y = classification_data
    clf = LogisticRegression(tol=tol, max_iter=200).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("C", [0.01, 0.1, 1.0, 10.0, 100.0])
def test_logistic_regression_C(classification_data, C):
    """The model fits across several orders of magnitude of ``C``."""
    X, y = classification_data
    clf = LogisticRegression(C=C, max_iter=200).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_logistic_regression_fit_intercept(classification_data, fit_intercept):
    """The model fits with and without an intercept term."""
    X, y = classification_data
    clf = LogisticRegression(fit_intercept=fit_intercept, max_iter=200).fit(
        X, y
    )
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("intercept_scaling", [0.5, 1.0, 2.0])
def test_logistic_regression_intercept_scaling(
    classification_data, intercept_scaling
):
    """Different ``intercept_scaling`` values are accepted by liblinear."""
    X, y = classification_data
    # 'intercept_scaling' is only used when solver='liblinear' and
    # fit_intercept=True
    clf = LogisticRegression(
        solver="liblinear",
        fit_intercept=True,
        intercept_scaling=intercept_scaling,
        max_iter=200,
    ).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("class_weight", [None, "balanced"])
def test_logistic_regression_class_weight(classification_data, class_weight):
    """Unweighted and ``"balanced"`` class weighting both fit and predict."""
    X, y = classification_data
    clf = LogisticRegression(class_weight=class_weight, max_iter=200).fit(X, y)
    _assert_valid_predictions(clf, X, y)


def test_logistic_regression_class_weight_custom(classification_data):
    """An explicit per-class weight mapping is accepted."""
    X, y = classification_data
    class_weights = {0: 1, 1: 2, 2: 1}
    clf = LogisticRegression(class_weight=class_weights, max_iter=200).fit(
        X, y
    )
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize(
    "solver", ["newton-cg", "lbfgs", "liblinear", "sag", "saga"]
)
def test_logistic_regression_solver(classification_data, solver):
    """Every listed solver fits the default (l2) problem."""
    X, y = classification_data
    clf = LogisticRegression(solver=solver, max_iter=200).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("max_iter", [50, 100, 200, 500])
def test_logistic_regression_max_iter(classification_data, max_iter):
    """The model yields sensible predictions for several iteration budgets."""
    X, y = classification_data
    clf = LogisticRegression(max_iter=max_iter).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize(
    "multi_class, solver",
    [
        ("ovr", "liblinear"),
        ("ovr", "lbfgs"),
        ("multinomial", "lbfgs"),
        ("multinomial", "newton-cg"),
        ("multinomial", "sag"),
        ("multinomial", "saga"),
        ("auto", "lbfgs"),
        ("auto", "liblinear"),
    ],
)
def test_logistic_regression_multi_class(
    classification_data, multi_class, solver
):
    """Each listed ``multi_class``/solver pairing fits and predicts."""
    X, y = classification_data
    # Defensive guard: the parameter list above does not contain this
    # pairing, so the skip only matters if someone adds it later.
    if solver == "liblinear" and multi_class == "multinomial":
        pytest.skip("liblinear does not support multinomial multi_class")
    clf = LogisticRegression(
        multi_class=multi_class, solver=solver, max_iter=200
    ).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("warm_start", [True, False])
def test_logistic_regression_warm_start(classification_data, warm_start):
    """A single fit works regardless of the ``warm_start`` flag."""
    X, y = classification_data
    clf = LogisticRegression(warm_start=warm_start, max_iter=200).fit(X, y)
    _assert_valid_predictions(clf, X, y)


@pytest.mark.parametrize("l1_ratio", [0.0, 0.5, 1.0])
def test_logistic_regression_l1_ratio(classification_data, l1_ratio):
    """Elastic-net mixing works at both extremes and in between."""
    X, y = classification_data
    clf = LogisticRegression(
        penalty="elasticnet", solver="saga", l1_ratio=l1_ratio, max_iter=200
    ).fit(X, y)
    _assert_valid_predictions(clf, X, y)


# --- patch text that shares these source lines, kept verbatim as comments ---
# diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_nearest_neighbors.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_nearest_neighbors.py
# new file mode 100644
# index 0000000000..2144b06b63
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_nearest_neighbors.py
# @@ -0,0 +1,232 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import pairwise_distances


@pytest.fixture(scope="module")
def synthetic_data():
    """Return standardized blob data ``(X, y)`` shared by this module."""
    X, y = make_blobs(
        n_samples=500,
        n_features=10,
        centers=5,
        cluster_std=1.0,
        random_state=42,
    )
    # Standardize features
    X = StandardScaler().fit_transform(X)
    return X, y


@pytest.mark.parametrize("n_neighbors", [1, 5, 10, 20])
def test_nearest_neighbors_n_neighbors(synthetic_data, n_neighbors):
    """``kneighbors`` returns exactly ``n_neighbors`` columns."""
    X, _ = synthetic_data
    model = NearestNeighbors(n_neighbors=n_neighbors)
    model.fit(X)
    _, indices = model.kneighbors(X)
    # Check that the correct number of neighbors is returned
    assert (
        indices.shape[1] == n_neighbors
    ), f"Should return {n_neighbors} neighbors"


@pytest.mark.parametrize(
    "algorithm", ["auto", "ball_tree", "kd_tree", "brute"]
)
def test_nearest_neighbors_algorithm(synthetic_data, algorithm):
    """Every search algorithm answers one query row per input sample."""
    X, _ = synthetic_data
    model = NearestNeighbors(algorithm=algorithm)
    model.fit(X)
    _, indices = model.kneighbors(X)
    # Check that the output shape is correct
    assert (
        indices.shape[0] == X.shape[0]
    ), f"Number of samples should remain the same with algorithm={algorithm}"


@pytest.mark.parametrize(
    "metric", ["euclidean", "manhattan", "chebyshev", "minkowski"]
)
def test_nearest_neighbors_metric(synthetic_data, metric):
    """Each metric returns a full result; euclidean is verified numerically."""
    X, _ = synthetic_data
    model = NearestNeighbors(metric=metric)
    model.fit(X)
    distances, indices = model.kneighbors(X)
    # Shape check applies to every metric, so the non-euclidean cases are
    # not left without any assertion.
    expected_shape = (X.shape[0], model.n_neighbors)
    assert (
        distances.shape == expected_shape
    ), f"Distances shape should be {expected_shape} with metric={metric}"
    assert (
        indices.shape == expected_shape
    ), f"Indices shape should be {expected_shape} with metric={metric}"
    # Check that the distances are computed correctly
    if metric == "euclidean":
        # Verify distances manually for the first sample
        manual_distances = np.linalg.norm(X - X[0], axis=1)
        # The nearest neighbor of X[0] is X[0] itself, so the first expected
        # value is exactly 0. A purely relative tolerance cannot absorb any
        # round-off there, hence the explicit absolute tolerance.
        np.testing.assert_allclose(
            distances[0],
            np.sort(manual_distances)[: model.n_neighbors],
            rtol=1e-5,
            atol=1e-6,
            err_msg=f"Distances should match manual computation with metric={metric}",
        )


@pytest.mark.parametrize("p", [1, 2, 3])
def test_nearest_neighbors_p_parameter(synthetic_data, p):
    """Minkowski distances for several ``p`` match a manual computation."""
    X, _ = synthetic_data
    model = NearestNeighbors(metric="minkowski", p=p)
    model.fit(X)
    distances, _ = model.kneighbors(X)
    # Check that the distances are computed correctly
    manual_distances = np.sum(np.abs(X - X[0]) ** p, axis=1) ** (1 / p)
    # The first expected value is the exact-zero self-distance of X[0], which
    # a purely relative tolerance cannot cover; allow a small absolute error.
    np.testing.assert_allclose(
        distances[0],
        np.sort(manual_distances)[: model.n_neighbors],
        rtol=1e-5,
        atol=1e-6,
        err_msg=f"Distances should match manual Minkowski computation with p={p}",
    )


@pytest.mark.parametrize("leaf_size", [10, 30, 50])
def test_nearest_neighbors_leaf_size(synthetic_data, leaf_size):
    """The ``leaf_size`` argument is stored on the fitted estimator."""
    X, _ = synthetic_data
    model = NearestNeighbors(leaf_size=leaf_size)
    model.fit(X)
    # There's no direct effect on the output, but we can check that the
    # parameter is set
    assert model.leaf_size == leaf_size, f"Leaf size should be {leaf_size}"


@pytest.mark.parametrize("n_jobs", [1, -1])
def test_nearest_neighbors_n_jobs(synthetic_data, n_jobs):
    """A model built with ``n_jobs`` can be fitted and queried."""
    X, _ = synthetic_data
    model = NearestNeighbors(n_jobs=n_jobs)
    model.fit(X)
    # n_jobs has no observable effect on the result, so verify instead that
    # the fitted model answers a query with the expected shape.
    indices = model.kneighbors(X, return_distance=False)
    assert indices.shape == (
        X.shape[0],
        model.n_neighbors,
    ), f"NearestNeighbors should answer queries with n_jobs={n_jobs}"


def test_nearest_neighbors_radius(synthetic_data):
    """``radius_neighbors`` never returns a neighbor beyond the radius."""
    X, _ = synthetic_data
    radius = 1.0
    model = NearestNeighbors(radius=radius)
    model.fit(X)
    distances, _ = model.radius_neighbors(X)
    # Check that all returned distances are within the radius
    for dist in distances:
        assert np.all(
            dist <= radius
        ), f"All distances should be within the radius {radius}"


def test_nearest_neighbors_invalid_algorithm(synthetic_data):
    """An unknown ``algorithm`` name is rejected with ``ValueError``."""
    X, _ = synthetic_data
    with pytest.raises(ValueError):
        model = NearestNeighbors(algorithm="invalid_algorithm")
        model.fit(X)


def test_nearest_neighbors_invalid_metric(synthetic_data):
    """An unknown ``metric`` name is rejected with ``ValueError``."""
    X, _ = synthetic_data
    with pytest.raises(ValueError):
        model = NearestNeighbors(metric="invalid_metric")
        model.fit(X)


def test_nearest_neighbors_kneighbors_graph(synthetic_data):
    """``kneighbors_graph`` is CSR with ``n_neighbors`` entries per row."""
    X, _ = synthetic_data
    n_neighbors = 5
    model = NearestNeighbors(n_neighbors=n_neighbors)
    model.fit(X)
    graph = model.kneighbors_graph(X)
    # Check that the graph is of correct shape and type
    assert graph.shape == (
        X.shape[0],
        X.shape[0],
    ), "Graph shape should be (n_samples, n_samples)"
    assert graph.getformat() == "csr", "Graph should be in CSR format"
    # Check that each row has n_neighbors non-zero entries
    row_counts = np.diff(graph.indptr)
    assert np.all(
        row_counts == n_neighbors
    ), f"Each sample should have {n_neighbors} neighbors in the graph"


def test_nearest_neighbors_radius_neighbors_graph(synthetic_data):
    """``radius_neighbors_graph`` only links points within the radius."""
    X, _ = synthetic_data
    radius = 1.0
    model = NearestNeighbors(radius=radius)
    model.fit(X)
    graph = model.radius_neighbors_graph(X)
    # Check that the graph is of correct shape and type
    assert graph.shape == (
        X.shape[0],
        X.shape[0],
    ), "Graph shape should be (n_samples, n_samples)"
    assert graph.getformat() == "csr", "Graph should be in CSR format"
    # Check that non-zero entries correspond to distances within the radius.
    # Only the distance of each linked (row, col) pair is needed, so compute
    # it row-wise; a full pairwise matrix between the two index lists would
    # be nnz x nnz in size and still would not be the quantity to check.
    rows, cols = graph.nonzero()
    pair_distances = np.linalg.norm(X[rows] - X[cols], axis=1)
    assert np.all(
        pair_distances <= radius + 1e-6
    ), f"All linked samples should be within the radius {radius}"


@pytest.mark.parametrize("return_distance", [True, False])
def test_nearest_neighbors_return_distance(synthetic_data, return_distance):
    """``return_distance`` controls whether distances are returned."""
    X, _ = synthetic_data
    model = NearestNeighbors()
    model.fit(X)
    result = model.kneighbors(X, return_distance=return_distance)
    if return_distance:
        distances, indices = result
        assert (
            distances.shape == indices.shape
        ), "Distances and indices should have the same shape"
    else:
        indices = result
        assert indices.shape == (
            X.shape[0],
            model.n_neighbors,
        ), "Indices shape should match (n_samples, n_neighbors)"


def test_nearest_neighbors_no_data():
    """Fitting on ``None`` is rejected with ``ValueError``."""
    with pytest.raises(ValueError):
        model = NearestNeighbors()
        model.fit(None)


def test_nearest_neighbors_sparse_input():
    """A CSR matrix is accepted for both ``fit`` and ``kneighbors``."""
    from scipy.sparse import csr_matrix

    # Seeded generator so that a failure can be reproduced.
    rng = np.random.RandomState(42)
    X = csr_matrix(rng.rand(100, 20))
    model = NearestNeighbors()
    model.fit(X)
    distances, _ = model.kneighbors(X)
    assert distances.shape == (
        X.shape[0],
        model.n_neighbors,
    ), "Distances shape should match for sparse input"


def test_nearest_neighbors_mahalanobis(synthetic_data):
    """The Mahalanobis metric works with an explicit inverse covariance."""
    X, _ = synthetic_data
    cov = np.cov(X, rowvar=False)
    inv_cov = np.linalg.inv(cov)
    metric_params = {"VI": inv_cov}
    model = NearestNeighbors(metric="mahalanobis", metric_params=metric_params)
    model.fit(X)
    distances, _ = model.kneighbors(X)
    # Check that the distances are computed (cannot easily verify correctness)
    assert distances.shape == (
        X.shape[0],
        model.n_neighbors,
    ), "Distances shape should match"


# --- patch text that shares these source lines, kept verbatim as comments ---
# diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_pca.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_pca.py
# new file mode 100644
# index 0000000000..ee6d107921
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_pca.py
# @@ -0,0 +1,164 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler


@pytest.fixture(scope="module")
def pca_data():
    """Return standardized ``(X, y)`` data shared by this module."""
    X, y = make_classification(
        n_samples=300,
        n_features=10,
        n_informative=5,
        n_redundant=0,
        n_repeated=0,
        random_state=42,
    )
    # Standardize features before PCA
    X = StandardScaler().fit_transform(X)
    return X, y


def _assert_lossy_but_useful_reconstruction(X, X_reconstructed, context):
    """Check a PCA round trip keeps the shape and removes some of the error.

    The mean squared reconstruction error must be non-negative and strictly
    smaller than the mean squared value of ``X`` itself, i.e. the
    reconstruction has to be better than predicting all zeros.
    """
    assert (
        X_reconstructed.shape == X.shape
    ), f"Reconstruction should have the input shape ({context})"
    reconstruction_error = np.mean((X - X_reconstructed) ** 2)
    assert (
        0.0 <= reconstruction_error < np.mean(X**2)
    ), f"Reconstruction error {reconstruction_error} is not plausible ({context})"


@pytest.mark.parametrize("n_components", [2, 5, 8, 10])
def test_pca_n_components(pca_data, n_components):
    """The output width and the explained variance follow ``n_components``."""
    X, _ = pca_data
    pca = PCA(n_components=n_components).fit(X)
    X_transformed = pca.transform(X)
    # Check the shape of the transformed data
    assert (
        X_transformed.shape[1] == n_components
    ), f"Expected {n_components} components, got {X_transformed.shape[1]}"
    # Check that explained variance ratios sum up appropriately
    total_variance = np.sum(pca.explained_variance_ratio_)
    # When every component is kept the ratios sum to 1 only up to floating
    # point rounding, so compare with a small tolerance instead of exactly.
    assert (
        total_variance <= 1.0 + 1e-5
    ), "Total explained variance ratio cannot exceed 1"
    assert (
        total_variance > 0.0
    ), "Total explained variance ratio should be positive"


@pytest.mark.parametrize(
    "svd_solver", ["auto", "full", "arpack", "randomized"]
)
def test_pca_svd_solver(pca_data, svd_solver):
    """Every SVD solver supports a transform / inverse-transform round trip."""
    X, _ = pca_data
    pca = PCA(n_components=5, svd_solver=svd_solver, random_state=42).fit(X)
    X_transformed = pca.transform(X)
    # Reconstruct the data
    X_reconstructed = pca.inverse_transform(X_transformed)
    # Check reconstruction error
    _assert_lossy_but_useful_reconstruction(
        X, X_reconstructed, f"svd_solver={svd_solver}"
    )


@pytest.mark.parametrize("whiten", [True, False])
def test_pca_whiten(pca_data, whiten):
    """With ``whiten=True`` every output component has roughly unit variance."""
    X, _ = pca_data
    pca = PCA(n_components=5, whiten=whiten).fit(X)
    X_transformed = pca.transform(X)
    assert (
        X_transformed.shape == (X.shape[0], 5)
    ), f"Unexpected output shape with whiten={whiten}"
    # If whiten is True, transformed data should have unit variance
    if whiten:
        variances = np.var(X_transformed, axis=0)
        np.testing.assert_allclose(
            variances,
            1.0,
            atol=1e-1,
            err_msg="Transformed features should have unit variance when whiten=True",
        )


@pytest.mark.parametrize("tol", [0.0, 1e-4, 1e-2])
def test_pca_tol(pca_data, tol):
    """The iterative ``arpack`` solver works for several tolerances."""
    X, _ = pca_data
    pca = PCA(
        n_components=5, svd_solver="arpack", tol=tol, random_state=42
    ).fit(X)
    X_transformed = pca.transform(X)
    assert (
        X_transformed.shape == (X.shape[0], 5)
    ), f"Unexpected output shape with tol={tol}"
    # Since 'arpack' is iterative, tol might affect convergence
    # Check that the explained variance ratio is reasonable
    total_variance = np.sum(pca.explained_variance_ratio_)
    assert (
        total_variance > 0.5
    ), "Total explained variance should be significant"


def test_pca_random_state(pca_data):
    """Equal seeds reproduce the components of the randomized solver."""
    X, _ = pca_data
    pca1 = PCA(n_components=5, svd_solver="randomized", random_state=42).fit(X)
    pca2 = PCA(n_components=5, svd_solver="randomized", random_state=42).fit(X)
    # With the same random_state, components should be the same
    np.testing.assert_allclose(
        pca1.components_,
        pca2.components_,
        err_msg="Components should be the same with the same random_state",
    )
    pca3 = PCA(n_components=5, svd_solver="randomized", random_state=24).fit(X)
    # With a different random_state the components might differ, so only
    # their layout is compared.
    assert (
        pca3.components_.shape == pca1.components_.shape
    ), "Components shape should not depend on random_state"


@pytest.mark.parametrize("copy", [True, False])
def test_pca_copy(pca_data, copy):
    """With ``copy=True`` the caller's ``X`` must be left untouched."""
    X, _ = pca_data
    X_original = X.copy()
    PCA(n_components=5, copy=copy).fit(X)
    if copy:
        # X should remain unchanged
        assert np.allclose(X, X_original), "X has been modified when copy=True"
    # With copy=False the estimator is allowed to overwrite X, so there is
    # nothing to assert; the fit only has to complete.


@pytest.mark.parametrize("iterated_power", [0, 3, 5, "auto"])
def test_pca_iterated_power(pca_data, iterated_power):
    """The randomized solver accepts integer and ``"auto"`` power settings."""
    X, _ = pca_data
    pca = PCA(
        n_components=5,
        svd_solver="randomized",
        iterated_power=iterated_power,
        random_state=42,
    ).fit(X)
    X_transformed = pca.transform(X)
    assert X_transformed.shape == (
        X.shape[0],
        5,
    ), f"Unexpected output shape with iterated_power={iterated_power}"
    # Check that the explained variance ratio is reasonable
    total_variance = np.sum(pca.explained_variance_ratio_)
    assert (
        total_variance > 0.5
    ), f"Total explained variance should be significant with iterated_power={iterated_power}"


def test_pca_explained_variance_ratio(pca_data):
    """Keeping all components explains (numerically) all of the variance."""
    X, _ = pca_data
    pca = PCA(n_components=None).fit(X)
    total_variance = np.sum(pca.explained_variance_ratio_)
    np.testing.assert_almost_equal(
        total_variance,
        1.0,
        decimal=5,
        err_msg="Total explained variance ratio should sum to 1 when n_components=None",
    )


def test_pca_inverse_transform(pca_data):
    """``inverse_transform`` maps the projection back to the input space."""
    X, _ = pca_data
    pca = PCA(n_components=5).fit(X)
    X_transformed = pca.transform(X)
    X_reconstructed = pca.inverse_transform(X_transformed)
    # Check reconstruction error
    _assert_lossy_but_useful_reconstruction(
        X, X_reconstructed, "n_components=5"
    )


# --- patch text that shares these source lines, kept verbatim as comments ---
# diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_ridge.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_ridge.py
# new file mode 100644
# index 0000000000..6223b0f32c
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_ridge.py
# @@ -0,0 +1,163 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#


import pytest
import numpy as np
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


@pytest.fixture(scope="module")
def regression_data():
    """Return standardized regression data ``(X, y)`` shared by this module."""
    X, y = make_regression(
        n_samples=500,
        n_features=20,
        n_informative=10,
        noise=0.1,
        random_state=42,
    )
    # Standardize features
    X = StandardScaler().fit_transform(X)
    return X, y


@pytest.mark.parametrize("alpha", [0.1, 1.0, 10.0, 100.0])
def test_ridge_alpha(regression_data, alpha):
    """The training fit stays reasonable across regularization strengths."""
    X, y = regression_data
    model = Ridge(alpha=alpha, random_state=42)
    model.fit(X, y)
    y_pred = model.predict(X)
    # Compute R^2 score
    r2 = r2_score(y, y_pred)
    assert r2 > 0.5, f"R^2 score should be reasonable for alpha={alpha}"


@pytest.mark.parametrize(
    "solver",
    ["auto", "svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga", "lbfgs"],
)
def test_ridge_solver(regression_data, solver):
    """Every listed solver gives a reasonable training fit."""
    X, y = regression_data
    # 'lbfgs' is exercised together with positive=True; every other solver
    # runs with positive=False.
    positive = solver == "lbfgs"
    model = Ridge(solver=solver, random_state=42, positive=positive)
    model.fit(X, y)
    y_pred = model.predict(X)
    # Compute R^2 score
    r2 = r2_score(y, y_pred)
    assert r2 > 0.5, f"R^2 score should be reasonable with solver={solver}"


@pytest.mark.parametrize("max_iter", [100, 500, 1000])
def test_ridge_max_iter(regression_data, max_iter):
    """The iterative ``sag`` solver never exceeds its iteration budget."""
    X, y = regression_data
    model = Ridge(max_iter=max_iter, solver="sag", random_state=42)
    model.fit(X, y)
    # n_iter_ may be array-valued; reduce the comparison explicitly rather
    # than relying on the implicit truth value of an array.
    assert np.all(
        np.asarray(model.n_iter_) <= max_iter
    ), "Number of iterations should not exceed max_iter"


@pytest.mark.parametrize("tol", [1e-4, 1e-3, 1e-2])
def test_ridge_tol(regression_data, tol):
    """The ``sag`` solver gives a reasonable fit for several tolerances."""
    X, y = regression_data
    model = Ridge(tol=tol, solver="sag", random_state=42)
    model.fit(X, y)
    y_pred = model.predict(X)
    # Compute R^2 score
    r2 = r2_score(y, y_pred)
    assert r2 > 0.5, f"R^2 score should be reasonable for tol={tol}"


@pytest.mark.parametrize("fit_intercept", [True, False])
def test_ridge_fit_intercept(regression_data, fit_intercept):
    """Ridge reaches a reasonable training R^2 with or without an intercept."""
    features, target = regression_data
    estimator = Ridge(fit_intercept=fit_intercept, random_state=42)
    estimator.fit(features, target)
    # Score the model on the data it was trained on.
    score = r2_score(target, estimator.predict(features))
    assert (
        score > 0.5
    ), f"R^2 score should be reasonable with fit_intercept={fit_intercept}"


def test_ridge_random_state(regression_data):
    """Same seed reproduces ``sag`` coefficients; another seed alters them."""
    features, target = regression_data

    def fit_sag(seed):
        # One freshly constructed and fitted model per call.
        fitted = Ridge(solver="sag", random_state=seed)
        fitted.fit(features, target)
        return fitted

    first = fit_sag(42)
    repeat = fit_sag(42)
    # Identical seeds have to agree.
    np.testing.assert_allclose(
        first.coef_,
        repeat.coef_,
        err_msg="Coefficients should be the same with the same random_state",
    )
    other = fit_sag(24)
    # The comparison inside the block is expected to fail, which is what
    # makes this check pass.
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(
            first.coef_,
            other.coef_,
            err_msg="Coefficients should differ with different random_state",
        )


@pytest.mark.parametrize("copy_X", [True, False])
def test_ridge_copy_X(regression_data, copy_X):
    """The input matrix is preserved whenever ``copy_X=True``."""
    features, target = regression_data
    snapshot = features.copy()
    estimator = Ridge(copy_X=copy_X, random_state=42)
    estimator.fit(features, target)
    if not copy_X:
        # Nothing can be promised about the input in this mode.
        return
    assert np.allclose(
        features, snapshot
    ), "X has been modified when copy_X=True"


def test_ridge_convergence_warning(regression_data):
    """One ``sag`` iteration is expected to raise a ``ConvergenceWarning``."""
    from sklearn.exceptions import ConvergenceWarning

    features, target = regression_data
    with pytest.warns(ConvergenceWarning):
        estimator = Ridge(max_iter=1, solver="sag", random_state=42)
        estimator.fit(features, target)


def test_ridge_coefficients(regression_data):
    """A fitted Ridge model has at least one non-zero coefficient."""
    features, target = regression_data
    estimator = Ridge(alpha=1.0, random_state=42)
    estimator.fit(features, target)
    nonzero_total = np.sum(estimator.coef_ != 0)
    assert nonzero_total > 0, "There should be non-zero coefficients"


def test_ridge_positive(regression_data):
    """``positive=True`` with ``lbfgs`` yields no negative coefficient."""
    features, target = regression_data
    estimator = Ridge(positive=True, solver="lbfgs", random_state=42)
    estimator.fit(features, target)
    weights = estimator.coef_
    # Every single weight has to satisfy the sign constraint.
    assert np.all(
        weights >= 0
    ), "All coefficients should be non-negative when positive=True"


# --- patch text that shares these source lines, kept verbatim as comments ---
# diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_tsne.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_tsne.py
# new file mode 100644
# index 0000000000..1c8f145c75
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_tsne.py
# @@ -0,0 +1,195 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.manifold import TSNE
from sklearn.metrics import pairwise_distances
from sklearn.preprocessing import StandardScaler


@pytest.fixture(scope="module")
def synthetic_data():
    """Return standardized five-class ``(X, y)`` data shared by this module."""
    X, y = make_classification(
        n_samples=100,
        n_features=20,
        n_informative=10,
        n_redundant=10,
        n_clusters_per_class=1,
        n_classes=5,
        random_state=42,
    )
    # Standardize features
    X = StandardScaler().fit_transform(X)
    return X, y


@pytest.mark.parametrize("n_components", [2, 3])
def test_tsne_n_components(synthetic_data, n_components):
    """The embedding has exactly ``n_components`` columns."""
    X, _ = synthetic_data
    model = TSNE(n_components=n_components, random_state=42)
    X_embedded = model.fit_transform(X)
    assert (
        X_embedded.shape[1] == n_components
    ), f"Output dimensions should be {n_components}"


@pytest.mark.parametrize("perplexity", [50])
def test_tsne_perplexity(synthetic_data, perplexity):
    """A non-default perplexity still embeds every sample."""
    X, _ = synthetic_data
    model = TSNE(perplexity=perplexity, random_state=42)
    X_embedded = model.fit_transform(X)
    # Check that the embedding has the correct shape
    assert (
        X_embedded.shape[0] == X.shape[0]
    ), "Number of samples should remain the same"


@pytest.mark.parametrize("early_exaggeration", [12.0])
def test_tsne_early_exaggeration(synthetic_data, early_exaggeration):
    """An explicit ``early_exaggeration`` still embeds every sample."""
    X, _ = synthetic_data
    model = TSNE(early_exaggeration=early_exaggeration, random_state=42)
    X_embedded = model.fit_transform(X)
    # Check that the embedding has the correct shape
    assert (
        X_embedded.shape[0] == X.shape[0]
    ), "Number of samples should remain the same"


@pytest.mark.parametrize("learning_rate", [200])
def test_tsne_learning_rate(synthetic_data, learning_rate):
    """A numeric ``learning_rate`` still embeds every sample."""
    X, _ = synthetic_data
    model = TSNE(learning_rate=learning_rate, random_state=42)
    X_embedded = model.fit_transform(X)
    # Check that the embedding has the correct shape
    assert (
        X_embedded.shape[0] == X.shape[0]
    ), "Number of samples should remain the same"


@pytest.mark.parametrize("n_iter", [250])
def test_tsne_n_iter(synthetic_data, n_iter):
    """The reported iteration count is at least the requested ``n_iter``."""
    X, _ = synthetic_data
    # NOTE(review): newer scikit-learn releases appear to rename this
    # argument to ``max_iter`` - confirm against the supported versions.
    model = TSNE(n_iter=n_iter, random_state=42)
    model.fit_transform(X)
    # Since TSNE may perform additional iterations, check if n_iter_ is at
    # least n_iter
    assert (
        model.n_iter_ >= n_iter
    ), f"Number of iterations should be at least {n_iter}"


@pytest.mark.parametrize("metric", ["euclidean", "manhattan", "cosine"])
def test_tsne_metric(synthetic_data, metric):
    """Each listed input-space metric embeds every sample."""
    X, _ = synthetic_data
    model = TSNE(metric=metric, random_state=42)
    X_embedded = model.fit_transform(X)
    # Check that the embedding has the correct shape
    assert (
        X_embedded.shape[0] == X.shape[0]
    ), f"Embedding should have same number of samples with metric={metric}"


@pytest.mark.parametrize("init", ["random", "pca"])
def test_tsne_init(synthetic_data, init):
    """Both initialization strategies embed every sample."""
    X, _ = synthetic_data
    model = TSNE(init=init, random_state=42)
    X_embedded = model.fit_transform(X)
    # Check that the embedding has the correct shape
    assert (
        X_embedded.shape[0] == X.shape[0]
    ), f"Embedding should have same number of samples with init={init}"


@pytest.mark.parametrize("method", ["barnes_hut", "exact"])
def test_tsne_method(synthetic_data, method):
    """Both gradient methods embed every sample."""
    X, _ = synthetic_data
    model = TSNE(method=method, random_state=42)
    X_embedded = model.fit_transform(X)
    # Check that the embedding has the correct shape
    assert (
        X_embedded.shape[0] == X.shape[0]
    ), f"Embedding should have same number of samples with method={method}"


@pytest.mark.parametrize("angle", [0.2])
def test_tsne_angle(synthetic_data, angle):
    """``angle`` is stored on the model and the fit embeds every sample."""
    X, _ = synthetic_data
    model = TSNE(method="barnes_hut", angle=angle, random_state=42)
    X_embedded = model.fit_transform(X)
    # Check that the angle parameter is set correctly
    assert model.angle == angle, f"Angle should be {angle}"
    assert (
        X_embedded.shape[0] == X.shape[0]
    ), f"Embedding should have same number of samples with angle={angle}"


def test_tsne_random_state(synthetic_data):
    """Equal seeds reproduce the embedding."""
    X, _ = synthetic_data
    model1 = TSNE(random_state=42)
    X_embedded1 = model1.fit_transform(X)
    model2 = TSNE(random_state=42)
    X_embedded2 = model2.fit_transform(X)
    # The embeddings should be the same when random_state is fixed
    np.testing.assert_allclose(
        X_embedded1,
        X_embedded2,
        atol=1e-5,
        err_msg="Embeddings should be the same with the same random_state",
    )
    model3 = TSNE(random_state=24)
    X_embedded3 = model3.fit_transform(X)
    # A different seed may give a different layout, so only its shape is
    # compared.
    assert (
        X_embedded3.shape == X_embedded1.shape
    ), "Embedding shape should not depend on random_state"


def test_tsne_verbose(synthetic_data, capsys):
    """``verbose=1`` writes progress information to standard output."""
    X, _ = synthetic_data
    model = TSNE(verbose=1, random_state=42)
    model.fit_transform(X)
    captured = capsys.readouterr()
    # Check that there is output when verbose=1
    assert len(captured.out) > 0, "There should be output when verbose=1"


def test_tsne_structure_preservation(synthetic_data):
    """Pairwise distances before and after embedding correlate positively."""
    X, _ = synthetic_data
    model = TSNE(random_state=42)
    X_embedded = model.fit_transform(X)
    # Compute pairwise distances in original and embedded spaces
    dist_original = pairwise_distances(X)
    dist_embedded = pairwise_distances(X_embedded)
    # Compute correlation between the distances
    corr = np.corrcoef(dist_original.ravel(), dist_embedded.ravel())[0, 1]
    # Deliberately weak bound: only require that points far apart in the
    # input tend to stay further apart in the embedding than close ones.
    assert np.isfinite(corr) and corr > 0, (
        "Distances in the embedding should correlate positively with the "
        f"original distances, got correlation {corr}"
    )


@pytest.mark.parametrize("min_grad_norm", [1e-5])
def test_tsne_min_grad_norm(synthetic_data, min_grad_norm):
    """``min_grad_norm`` is stored unchanged on the fitted model."""
    X, _ = synthetic_data
    model = TSNE(min_grad_norm=min_grad_norm, random_state=42)
    model.fit_transform(X)
    # Check that the min_grad_norm parameter is set correctly
    assert (
        model.min_grad_norm == min_grad_norm
    ), f"min_grad_norm should be {min_grad_norm}"


def test_tsne_metric_params(synthetic_data):
    """Extra metric arguments are forwarded via ``metric_params``."""
    X, _ = synthetic_data
    metric_params = {"p": 2}
    model = TSNE(
        metric="minkowski", metric_params=metric_params, random_state=42
    )
    X_embedded = model.fit_transform(X)
    # Check that the embedding has the correct shape
    assert (
        X_embedded.shape[0] == X.shape[0]
    ), "Embedding should have same number of samples with custom metric_params"


# --- patch text that shares these source lines, kept verbatim as a comment ---
# diff --git
# a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_tsvd.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_tsvd.py
# new file mode 100644
# index 0000000000..f2c0a43c63
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_tsvd.py
# @@ -0,0 +1,187 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import numpy as np
from sklearn.datasets import make_classification
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import StandardScaler
from scipy.sparse import csr_matrix


@pytest.fixture(scope="module")
def svd_data():
    """Return a 300 x 50 classification dataset as ``(csr_matrix, y)``."""
    X, y = make_classification(
        n_samples=300,
        n_features=50,
        n_informative=10,
        n_redundant=10,
        random_state=42,
    )
    # Convert the data to a sparse CSR matrix
    X_sparse = csr_matrix(X)
    return X_sparse, y


@pytest.mark.parametrize("n_components", [5, 10, 20, 30])
def test_truncated_svd_n_components(svd_data, n_components):
    """Output width equals ``n_components``; variance ratios sum to (0, 1]."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    X_transformed = svd.fit_transform(X_sparse)
    # Check the shape of the transformed data
    assert (
        X_transformed.shape[1] == n_components
    ), f"Expected {n_components} components, got {X_transformed.shape[1]}"
    # Check that explained variance ratios sum up appropriately
    total_variance = np.sum(svd.explained_variance_ratio_)
    assert (
        total_variance <= 1.0
    ), "Total explained variance ratio cannot exceed 1"
    assert (
        total_variance > 0.0
    ), "Total explained variance ratio should be positive"


@pytest.mark.parametrize("algorithm", ["randomized", "arpack"])
def test_truncated_svd_algorithm(svd_data, algorithm):
    """Both solvers fit and give a finite reconstruction error."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(n_components=10, algorithm=algorithm, random_state=42)
    X_transformed = svd.fit_transform(X_sparse)
    # Reconstruct the data
    X_reconstructed = svd.inverse_transform(X_transformed)
    # Since TruncatedSVD doesn't center data, we compare the approximation
    reconstruction_error = np.mean((X_sparse.toarray() - X_reconstructed) ** 2)
    # The original computed the error and discarded it; no accuracy
    # threshold is imposed, only that the round trip is numerically valid.
    assert np.isfinite(
        reconstruction_error
    ), f"Reconstruction error should be finite with algorithm={algorithm}"


@pytest.mark.parametrize("n_iter", [5, 7, 10])
def test_truncated_svd_n_iter(svd_data, n_iter):
    """Explained variance stays above one half for each ``n_iter``."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(n_components=10, n_iter=n_iter, random_state=42)
    svd.fit_transform(X_sparse)
    # Check that the explained variance ratio is reasonable
    total_variance = np.sum(svd.explained_variance_ratio_)
    assert (
        total_variance > 0.5
    ), f"Total explained variance should be significant with n_iter={n_iter}"


def test_truncated_svd_random_state(svd_data):
    """Equal seeds give equal components; a different seed does not."""
    X_sparse, _ = svd_data
    svd1 = TruncatedSVD(
        n_components=10, algorithm="randomized", random_state=42
    )
    svd2 = TruncatedSVD(
        n_components=10, algorithm="randomized", random_state=42
    )
    # Only the fitted components are compared, so outputs are not bound.
    svd1.fit_transform(X_sparse)
    svd2.fit_transform(X_sparse)
    # With the same random_state, components should be the same
    np.testing.assert_allclose(
        svd1.components_,
        svd2.components_,
        err_msg="Components should be the same with the same random_state",
    )
    svd3 = TruncatedSVD(
        n_components=10, algorithm="randomized", random_state=24
    )
    svd3.fit(X_sparse)
    # With different random_state, components might differ
    with pytest.raises(AssertionError):
        np.testing.assert_allclose(
            svd1.components_,
            svd3.components_,
            err_msg="Components should differ with different random_state",
        )


@pytest.mark.parametrize("tol", [0.0, 1e-4, 1e-2])
def test_truncated_svd_tol(svd_data, tol):
    """ARPACK with each ``tol`` keeps explained variance above one half."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(
        n_components=10, algorithm="arpack", tol=tol, random_state=42
    )
    svd.fit_transform(X_sparse)
    # Check that the explained variance ratio is reasonable
    total_variance = np.sum(svd.explained_variance_ratio_)
    assert (
        total_variance > 0.5
    ), f"Total explained variance should be significant with tol={tol}"


# "QR" replaces the original "OR", which is not an accepted value of
# TruncatedSVD's ``power_iteration_normalizer`` (auto / QR / LU / none).
@pytest.mark.parametrize(
    "power_iteration_normalizer", ["auto", "QR", "LU", "none"]
)
def test_truncated_svd_power_iteration_normalizer(
    svd_data, power_iteration_normalizer
):
    """Every power-iteration normalizer keeps explained variance above 0.5."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(
        n_components=10,
        power_iteration_normalizer=power_iteration_normalizer,
        random_state=42,
    )
    svd.fit_transform(X_sparse)
    # Check that the explained variance ratio is reasonable
    total_variance = np.sum(svd.explained_variance_ratio_)
    assert (
        total_variance > 0.5
    ), f"Total explained variance should be significant with power_iteration_normalizer={power_iteration_normalizer}"


def test_truncated_svd_inverse_transform(svd_data):
    """``inverse_transform`` round-trips with a finite error."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(n_components=10, random_state=42)
    X_transformed = svd.fit_transform(X_sparse)
    X_reconstructed = svd.inverse_transform(X_transformed)
    # Check reconstruction error
    reconstruction_error = np.mean((X_sparse.toarray() - X_reconstructed) ** 2)
    # Previously computed and ignored; assert it is at least a valid number.
    assert np.isfinite(
        reconstruction_error
    ), "Reconstruction error should be finite"


def test_truncated_svd_sparse_input_dense_output(svd_data):
    """Sparse input produces a non-CSR (dense) output."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(n_components=10, random_state=42)
    X_transformed = svd.fit_transform(X_sparse)
    # The output should be dense even if input is sparse
    assert not isinstance(
        X_transformed, csr_matrix
    ), "Transformed data should be dense"


def test_truncated_svd_components_norm(svd_data):
    """Each row of ``components_`` has unit Euclidean norm."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(n_components=10, random_state=42)
    svd.fit(X_sparse)
    components_norm = np.linalg.norm(svd.components_, axis=1)
    np.testing.assert_allclose(
        components_norm,
        1.0,
        atol=1e-5,
        err_msg="Each component should have unit length",
    )


@pytest.mark.parametrize("n_oversamples", [5, 10, 15])
def test_truncated_svd_n_oversamples(svd_data, n_oversamples):
    """Each ``n_oversamples`` keeps explained variance above one half."""
    X_sparse, _ = svd_data
    svd = TruncatedSVD(
        n_components=10, n_oversamples=n_oversamples, random_state=42
    )
    svd.fit_transform(X_sparse)
    # Check that the explained variance ratio is reasonable
    total_variance = np.sum(svd.explained_variance_ratio_)
    assert (
        total_variance > 0.5
    ), f"Total explained variance should be significant with n_oversamples={n_oversamples}"
# diff --git a/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_umap.py b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_umap.py
# new file mode 100644
# index 0000000000..1f9ab6ac5b
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/estimators_hyperparams/test_umap.py
# @@ -0,0 +1,181 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import numpy as np
from sklearn.datasets import make_swiss_roll
from umap import UMAP
from sklearn.manifold import trustworthiness


@pytest.fixture(scope="module")
def manifold_data():
    """Return the points of a 100-sample noisy swiss roll (labels dropped)."""
    points, _ = make_swiss_roll(n_samples=100, noise=0.05, random_state=42)
    return points


def _fit_and_score(data, **umap_params):
    """Embed *data* with a seeded UMAP and return its 5-NN trustworthiness."""
    embedding = UMAP(random_state=42, **umap_params).fit_transform(data)
    return trustworthiness(data, embedding, n_neighbors=5)


@pytest.mark.parametrize("n_neighbors", [5])
def test_umap_n_neighbors(manifold_data, n_neighbors):
    """Smoke test: fit with ``n_neighbors`` and report trustworthiness."""
    trust = _fit_and_score(manifold_data, n_neighbors=n_neighbors)
    print(f"Trustworthiness with n_neighbors={n_neighbors}: {trust}")


@pytest.mark.parametrize("min_dist", [0.0, 0.5])
def test_umap_min_dist(manifold_data, min_dist):
    """Smoke test: fit with ``min_dist`` and report trustworthiness."""
    trust = _fit_and_score(manifold_data, min_dist=min_dist)
    print(f"Trustworthiness with min_dist={min_dist}: {trust}")


@pytest.mark.parametrize(
    "metric", ["euclidean", "manhattan", "chebyshev", "cosine"]
)
def test_umap_metric(manifold_data, metric):
    """Smoke test: fit with each ``metric`` and report trustworthiness."""
    trust = _fit_and_score(manifold_data, metric=metric)
    print(f"Trustworthiness with metric={metric}: {trust}")


@pytest.mark.parametrize("n_components", [2, 3])
def test_umap_n_components(manifold_data, n_components):
    """Smoke test: fit with ``n_components`` and report trustworthiness."""
    trust = _fit_and_score(manifold_data, n_components=n_components)
    print(f"Trustworthiness with n_components={n_components}: {trust}")


@pytest.mark.parametrize("spread", [0.5, 1.5])
def test_umap_spread(manifold_data, spread):
    """Smoke test: fit with ``spread`` and report trustworthiness."""
    trust = _fit_and_score(manifold_data, spread=spread)
    print(f"Trustworthiness with spread={spread}: {trust}")
@pytest.mark.parametrize("negative_sample_rate", [5])
def test_umap_negative_sample_rate(manifold_data, negative_sample_rate):
    """Smoke test: fit with ``negative_sample_rate``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(negative_sample_rate=negative_sample_rate, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(
        f"Trustworthiness with negative_sample_rate={negative_sample_rate}: {trust}"
    )


@pytest.mark.parametrize("learning_rate", [0.1, 10.0])
def test_umap_learning_rate(manifold_data, learning_rate):
    """Smoke test: fit with ``learning_rate``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(learning_rate=learning_rate, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(f"Trustworthiness with learning_rate={learning_rate}: {trust}")


@pytest.mark.parametrize("init", ["spectral", "random"])
def test_umap_init(manifold_data, init):
    """Smoke test: fit with each ``init``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(init=init, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(f"Trustworthiness with init={init}: {trust}")


def test_umap_consistency(manifold_data):
    """Two fits with the same ``random_state`` give equal embeddings."""
    X = manifold_data
    umap1 = UMAP(random_state=42).fit(X)
    umap2 = UMAP(random_state=42).fit(X)
    assert np.allclose(
        umap1.embedding_, umap2.embedding_
    ), "Embeddings should be consistent across runs with the same random_state"


@pytest.mark.parametrize("n_epochs", [100, 200, 500])
def test_umap_n_epochs(manifold_data, n_epochs):
    """Smoke test: fit with ``n_epochs``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(n_epochs=n_epochs, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(f"Trustworthiness with n_epochs={n_epochs}: {trust}")


@pytest.mark.parametrize("local_connectivity", [1, 2, 5])
def test_umap_local_connectivity(manifold_data, local_connectivity):
    """Smoke test: fit with ``local_connectivity``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(local_connectivity=local_connectivity, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(
        f"Trustworthiness with local_connectivity={local_connectivity}: {trust}"
    )


@pytest.mark.parametrize("repulsion_strength", [0.5, 1.0, 2.0])
def test_umap_repulsion_strength(manifold_data, repulsion_strength):
    """Smoke test: fit with ``repulsion_strength``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(repulsion_strength=repulsion_strength, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(
        f"Trustworthiness with repulsion_strength={repulsion_strength}: {trust}"
    )


@pytest.mark.parametrize("metric_kwds", [{"p": 1}, {"p": 2}, {"p": 3}])
def test_umap_metric_kwds(manifold_data, metric_kwds):
    """Smoke test: Minkowski metric with each ``p``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(metric="minkowski", metric_kwds=metric_kwds, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(f"Trustworthiness with metric_kwds={metric_kwds}: {trust}")


@pytest.mark.parametrize("angular_rp_forest", [True, False])
def test_umap_angular_rp_forest(manifold_data, angular_rp_forest):
    """Smoke test: fit with ``angular_rp_forest``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(angular_rp_forest=angular_rp_forest, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(
        f"Trustworthiness with angular_rp_forest={angular_rp_forest}: {trust}"
    )


@pytest.mark.parametrize("densmap", [True, False])
def test_umap_densmap(manifold_data, densmap):
    """Smoke test: fit with ``densmap``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(densmap=densmap, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(f"Trustworthiness with densmap={densmap}: {trust}")


@pytest.mark.parametrize("output_metric", ["euclidean", "manhattan"])
def test_umap_output_metric(manifold_data, output_metric):
    """Smoke test: fit with each ``output_metric``; report trustworthiness."""
    X = manifold_data
    umap = UMAP(output_metric=output_metric, random_state=42)
    X_embedded = umap.fit_transform(X)
    trust = trustworthiness(X, X_embedded, n_neighbors=5)
    print(f"Trustworthiness with output_metric={output_metric}: {trust}")
# diff --git a/python/cuml/cuml/tests/experimental/accel/test_basic_estimators.py b/python/cuml/cuml/tests/experimental/accel/test_basic_estimators.py
# new file mode 100644
# index 0000000000..63f8ca0e51
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/test_basic_estimators.py
# @@ -0,0 +1,142 @@
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import pytest
import numpy as np
from sklearn.datasets import make_classification, make_regression, make_blobs
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    ElasticNet,
    Ridge,
    Lasso,
)
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.kernel_ridge import KernelRidge
from sklearn.manifold import TSNE
from sklearn.neighbors import (
    NearestNeighbors,
    KNeighborsClassifier,
    KNeighborsRegressor,
)
from sklearn.metrics import (
    mean_squared_error,
    r2_score,
    adjusted_rand_score,
    accuracy_score,
)
from scipy.sparse import random as sparse_random


# The tests below are smoke tests: each fits one estimator with default
# hyperparameters. The original bound results it never looked at; they are
# now checked for one output row per input sample, and nothing stronger.


def test_kmeans():
    """KMeans fits and predicts one label per sample."""
    X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
    clf = KMeans().fit(X)
    y_pred = clf.predict(X)
    assert len(y_pred) == X.shape[0]


def test_dbscan():
    """DBSCAN fits and exposes one label per sample."""
    X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
    clf = DBSCAN().fit(X)
    y_pred = clf.labels_
    assert len(y_pred) == X.shape[0]


def test_pca():
    """PCA fits and transforms every sample."""
    X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
    pca = PCA().fit(X)
    X_transformed = pca.transform(X)
    assert X_transformed.shape[0] == X.shape[0]


def test_truncated_svd():
    """TruncatedSVD fits and transforms every sample."""
    X, _ = make_blobs(n_samples=100, centers=3, random_state=42)
    svd = TruncatedSVD().fit(X)
    X_transformed = svd.transform(X)
    assert X_transformed.shape[0] == X.shape[0]


def test_linear_regression():
    """LinearRegression fits and predicts one value per sample."""
    X, y = make_regression(
        n_samples=100, n_features=20, noise=0.1, random_state=42
    )
    lr = LinearRegression().fit(X, y)
    y_pred = lr.predict(X)
    assert len(y_pred) == len(y)


def test_logistic_regression():
    """LogisticRegression fits and predicts one label per sample."""
    X, y = make_classification(
        n_samples=100, n_features=20, n_classes=2, random_state=42
    )
    clf = LogisticRegression().fit(X, y)
    y_pred = clf.predict(X)
    assert len(y_pred) == len(y)


def test_elastic_net():
    """ElasticNet fits and predicts one value per sample."""
    X, y = make_regression(
        n_samples=100, n_features=20, noise=0.1, random_state=42
    )
    enet = ElasticNet().fit(X, y)
    y_pred = enet.predict(X)
    assert len(y_pred) == len(y)


def test_ridge():
    """Ridge fits and predicts one value per sample."""
    X, y = make_regression(
        n_samples=100, n_features=20, noise=0.1, random_state=42
    )
    ridge = Ridge().fit(X, y)
    y_pred = ridge.predict(X)
    assert len(y_pred) == len(y)


def test_lasso():
    """Lasso fits and predicts one value per sample."""
    X, y = make_regression(
        n_samples=100, n_features=20, noise=0.1, random_state=42
    )
    lasso = Lasso().fit(X, y)
    y_pred = lasso.predict(X)
    assert len(y_pred) == len(y)


def test_tsne():
    """TSNE with defaults embeds every sample."""
    X, _ = make_blobs(n_samples=100, centers=3, n_features=20, random_state=42)
    tsne = TSNE()
    X_embedded = tsne.fit_transform(X)
    assert X_embedded.shape[0] == X.shape[0]


def test_nearest_neighbors():
    """NearestNeighbors returns the default five neighbours per sample."""
    X, _ = make_blobs(n_samples=100, centers=3, n_features=20, random_state=42)
    nn = NearestNeighbors().fit(X)
    distances, indices = nn.kneighbors(X)
    assert distances.shape == (100, 5)
    assert indices.shape == (100, 5)


def test_k_neighbors_classifier():
    """KNeighborsClassifier fits for each weights/metric combination."""
    X, y = make_classification(
        n_samples=100,
        n_features=20,
        n_classes=3,
        random_state=42,
        n_informative=6,
    )
    for weights in ["uniform", "distance"]:
        for metric in ["euclidean", "manhattan"]:
            # The loop variables were previously ignored, so the default
            # model was fitted four times; pass them through.
            knn = KNeighborsClassifier(weights=weights, metric=metric).fit(
                X, y
            )
            y_pred = knn.predict(X)
            assert len(y_pred) == len(y)


def test_k_neighbors_regressor():
    """KNeighborsRegressor fits for each weights/metric combination."""
    X, y = make_regression(
        n_samples=100, n_features=20, noise=0.1, random_state=42
    )
    for weights in ["uniform", "distance"]:
        for metric in ["euclidean", "manhattan"]:
            # See test_k_neighbors_classifier: actually use the loop values.
            knr = KNeighborsRegressor(weights=weights, metric=metric).fit(
                X, y
            )
            y_pred = knr.predict(X)
            assert len(y_pred) == len(y)
# diff --git a/python/cuml/cuml/tests/experimental/accel/test_optuna.py b/python/cuml/cuml/tests/experimental/accel/test_optuna.py
# new file mode 100644
# index 0000000000..6471a58ebf
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/test_optuna.py
# @@ -0,0 +1,146 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
import optuna
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.cluster import KMeans, DBSCAN
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import (
    LinearRegression,
    LogisticRegression,
    ElasticNet,
    Ridge,
    Lasso,
)
from sklearn.manifold import TSNE
from sklearn.neighbors import (
    NearestNeighbors,
    KNeighborsClassifier,
    KNeighborsRegressor,
)
import umap
import hdbscan


@pytest.fixture
def classification_data():
    """Return an 80/20 split of a 100 x 10 classification dataset."""
    X, y = make_classification(n_samples=100, n_features=10, random_state=42)
    return train_test_split(X, y, test_size=0.2, random_state=42)


@pytest.fixture
def regression_data():
    """Return an 80/20 split of a 100 x 10 regression dataset."""
    X, y = make_regression(
        n_samples=100, n_features=10, noise=0.1, random_state=42
    )
    return train_test_split(X, y, test_size=0.2, random_state=42)


def objective(trial, estimator, X_train, y_train):
    """Return the mean 3-fold CV score of *estimator* for one Optuna trial.

    A value is suggested for each of ``C``, ``alpha``, ``l1_ratio`` and
    ``n_neighbors`` that the estimator exposes as an attribute, and applied
    with ``set_params`` (which mutates *estimator* in place).
    """
    params = {}
    # suggest_float(log=True) replaces the deprecated suggest_loguniform and
    # plain suggest_float replaces suggest_uniform; the ranges are unchanged.
    if hasattr(estimator, "C"):
        params["C"] = trial.suggest_float("C", 1e-3, 1e2, log=True)
    if hasattr(estimator, "alpha"):
        params["alpha"] = trial.suggest_float("alpha", 1e-3, 1e2, log=True)
    if hasattr(estimator, "l1_ratio"):
        params["l1_ratio"] = trial.suggest_float("l1_ratio", 0.0, 1.0)
    if hasattr(estimator, "n_neighbors"):
        params["n_neighbors"] = trial.suggest_int("n_neighbors", 1, 15)
    model = estimator.set_params(**params)
    score = cross_val_score(model, X_train, y_train, cv=3).mean()
    return score


@pytest.mark.parametrize(
    "estimator",
    [
        LogisticRegression(),
        KNeighborsClassifier(),
    ],
)
def test_classification_models_optuna(estimator, classification_data):
    """Ten Optuna trials reach a CV score above 0.5 for each classifier."""
    X_train, X_test, y_train, y_test = classification_data
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective(trial, estimator, X_train, y_train),
        n_trials=10,
    )

    assert study.best_value > 0.5, f"Failed to optimize {estimator}"


@pytest.mark.parametrize(
    "estimator",
    [
        LinearRegression(),
        Ridge(),
        Lasso(),
        ElasticNet(),
        KernelRidge(),
        KNeighborsRegressor(),
    ],
)
def test_regression_models_optuna(estimator, regression_data):
    """Ten Optuna trials run to completion for each regressor."""
    X_train, X_test, y_train, y_test = regression_data
    # objective() returns cross_val_score's default score (R^2 for
    # scikit-learn regressors), where higher is better. The original
    # "minimize" direction searched for the worst hyperparameters.
    study = optuna.create_study(direction="maximize")
    study.optimize(
        lambda trial: objective(trial, estimator, X_train, y_train),
        n_trials=10,
    )
    # Upper-bound sanity check kept from the original; no minimum quality
    # is enforced.
    assert study.best_value < 1.0, f"Failed to optimize {estimator}"


@pytest.mark.parametrize(
    "clustering_method",
    [
        KMeans(n_clusters=3, random_state=42),
        DBSCAN(),
        hdbscan.HDBSCAN(min_cluster_size=5),
    ],
)
def test_clustering_models(clustering_method, classification_data):
    """Each clusterer fits and labels every training sample."""
    X_train, X_test, y_train, y_test = classification_data
    clustering_method.fit(X_train)
    # Replaces ``assert True``, which could never fail.
    assert len(clustering_method.labels_) == len(
        X_train
    ), f"{clustering_method} did not label every training sample"


@pytest.mark.parametrize(
    "dimensionality_reduction_method",
    [
        PCA(n_components=5),
        TruncatedSVD(n_components=5),
        umap.UMAP(n_components=5),
        TSNE(n_components=2),
    ],
)
def test_dimensionality_reduction(
    dimensionality_reduction_method, classification_data
):
    """Each reducer outputs at most five columns."""
    X_train, X_test, y_train, y_test = classification_data
    X_transformed = dimensionality_reduction_method.fit_transform(X_train)
    # The message now describes the failure; it used to read as a success.
    assert X_transformed.shape[1] <= 5, (
        f"{dimensionality_reduction_method} produced "
        f"{X_transformed.shape[1]} components, expected at most 5"
    )


def test_nearest_neighbors(classification_data):
    """NearestNeighbors fits and returns five neighbours per query row."""
    X_train, X_test, y_train, y_test = classification_data
    nearest_neighbors = NearestNeighbors(n_neighbors=5)
    nearest_neighbors.fit(X_train)
    # Replaces ``assert True``: query the fitted index and check the shape.
    distances, indices = nearest_neighbors.kneighbors(X_test)
    assert distances.shape == (len(X_test), 5)
    assert indices.shape == (len(X_test), 5)
# diff --git a/python/cuml/cuml/tests/experimental/accel/test_pipeline.py b/python/cuml/cuml/tests/experimental/accel/test_pipeline.py
# new file mode 100644
# index 0000000000..ec9a1f1583
# --- /dev/null
# +++ b/python/cuml/cuml/tests/experimental/accel/test_pipeline.py
# @@ -0,0 +1,165 @@
#
# Copyright (c) 2024, NVIDIA CORPORATION.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import pytest
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.cluster import KMeans, DBSCAN
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import (
    LogisticRegression,
    LinearRegression,
    ElasticNet,
    Ridge,
    Lasso,
)
from sklearn.manifold import TSNE
from sklearn.neighbors import (
    NearestNeighbors,
    KNeighborsClassifier,
    KNeighborsRegressor,
)
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification, make_regression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, mean_squared_error
from umap import UMAP
import hdbscan
import numpy as np


@pytest.fixture
def classification_data():
    """Return an 80/20 split of a 100 x 20 binary classification dataset."""
    # Create a synthetic dataset for binary classification
    X, y = make_classification(n_samples=100, n_features=20, random_state=42)
    return train_test_split(X, y, test_size=0.2, random_state=42)


@pytest.fixture
def regression_data():
    """Return an 80/20 split of a 100 x 20 regression dataset."""
    # Create a synthetic dataset for regression
    X, y = make_regression(
        n_samples=100, n_features=20, noise=0.1, random_state=42
    )
    return train_test_split(X, y, test_size=0.2, random_state=42)


# Final-step estimators shared by the parametrized pipeline tests below.
classification_estimators = [
    LogisticRegression(),
    KNeighborsClassifier(),
]

regression_estimators = [
    LinearRegression(),
    Ridge(),
    Lasso(),
    ElasticNet(),
    KernelRidge(),
    KNeighborsRegressor(),
]


@pytest.mark.parametrize(
    "transformer",
    [
        PCA(n_components=5),
        TruncatedSVD(n_components=5),
        KMeans(n_clusters=5, random_state=42),
    ],
)
@pytest.mark.parametrize("estimator", classification_estimators)
def test_classification_transformers(
    transformer, estimator, classification_data
):
    """A transformer + classifier pipeline fits and predicts every test row."""
    X_train, X_test, y_train, y_test = classification_data
    # Create pipeline with the transformer and estimator
    pipeline = Pipeline(
        [("transformer", transformer), ("classifier", estimator)]
    )
    # Fit and predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Smoke check only: one prediction per test sample. The original
    # computed y_pred and never inspected it.
    assert len(y_pred) == len(y_test)


@pytest.mark.parametrize(
    "transformer",
    [
        PCA(n_components=5),
        TruncatedSVD(n_components=5),
        KMeans(n_clusters=5, random_state=42),
    ],
)
@pytest.mark.parametrize("estimator", regression_estimators)
def test_regression_transformers(transformer, estimator, regression_data):
    """A transformer + regressor pipeline fits and predicts every test row."""
    X_train, X_test, y_train, y_test = regression_data
    # Create pipeline with the transformer and estimator
    pipeline = Pipeline(
        [("transformer", transformer), ("regressor", estimator)]
    )
    # Fit and predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Smoke check only: one prediction per test sample; no error threshold.
    assert len(y_pred) == len(y_test)


@pytest.mark.parametrize(
    "transformer",
    [
        PCA(n_components=5),
        TruncatedSVD(n_components=5),
        KMeans(n_clusters=5, random_state=42),
    ],
)
@pytest.mark.parametrize("estimator", [NearestNeighbors(), DBSCAN()])
def test_unsupervised_neighbors(transformer, estimator, classification_data):
    """A transformer + unsupervised final step fits without labels."""
    X_train, X_test, _, _ = classification_data
    # Create pipeline with the transformer and unsupervised model
    pipeline = Pipeline(
        [("transformer", transformer), ("unsupervised", estimator)]
    )
    # Fit the model (no predict needed for unsupervised learning)
    pipeline.fit(X_train)


def test_umap_with_logistic_regression(classification_data):
    """A UMAP + LogisticRegression pipeline fits and predicts every test row."""
    # The original requested a fixture named ``data``, which this module
    # does not define; ``classification_data`` is the matching fixture.
    X_train, X_test, y_train, y_test = classification_data
    # Create pipeline with UMAP for dimensionality reduction and logistic regression
    pipeline = Pipeline(
        [
            ("umap", UMAP(n_components=5, random_state=42)),
            ("classifier", LogisticRegression()),
        ]
    )
    # Fit and predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Smoke check only: one prediction per test sample.
    assert len(y_pred) == len(y_test)


def test_hdbscan_with_logistic_regression(classification_data):
    """An HDBSCAN + LogisticRegression pipeline fits and predicts."""
    # Same fixture fix as above: ``data`` is not defined in this module.
    X_train, X_test, y_train, y_test = classification_data
    # Create pipeline with HDBSCAN for clustering and logistic regression
    # NOTE(review): Pipeline requires intermediate steps to implement
    # ``transform``; confirm the HDBSCAN object used here provides one,
    # otherwise this test needs reworking or an explicit xfail.
    pipeline = Pipeline(
        [
            ("hdbscan", hdbscan.HDBSCAN(min_cluster_size=5)),
            ("classifier", LogisticRegression()),
        ]
    )
    # Fit and predict
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)
    # Smoke check only: one prediction per test sample.
    assert len(y_pred) == len(y_test)