diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index bd25832ab7..30a7443829 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -9,7 +9,7 @@ repos:
         files: python/.*
         args: [--config, python/cuml/pyproject.toml]
   - repo: https://github.com/PyCQA/flake8
-    rev: 5.0.4
+    rev: 7.1.1
    hooks:
       - id: flake8
         args: [--config=python/cuml/.flake8]
@@ -60,7 +60,7 @@ repos:
         pass_filenames: false
         language: python
   - repo: https://github.com/rapidsai/pre-commit-hooks
-    rev: v0.3.1
+    rev: v0.4.0
    hooks:
       - id: verify-copyright
         files: |
diff --git a/BUILD.md b/BUILD.md
index 4bc8310407..059836e57d 100644
--- a/BUILD.md
+++ b/BUILD.md
@@ -18,7 +18,7 @@ To install cuML from source, ensure the following dependencies are met:
 It is recommended to use conda for environment/package management. If doing so, development environment .yaml files are located in `conda/environments/all_*.yaml`. These files contain most of the dependencies mentioned above (notable exceptions are `gcc` and `zlib`). To create a development environment named `cuml_dev`, you can use the following commands:
 
 ```bash
-conda create -n cuml_dev python=3.11
+conda create -n cuml_dev python=3.12
 conda env update -n cuml_dev --file=conda/environments/all_cuda-118_arch-x86_64.yaml
 conda activate cuml_dev
 ```
diff --git a/build.sh b/build.sh
index 2351834260..9eb36f103c 100755
--- a/build.sh
+++ b/build.sh
@@ -168,7 +168,7 @@ while true; do
       CMAKE_LOG_LEVEL=VERBOSE
       ;;
     -g | --debug )
-      BUILD_TYPE=Debug
+      BUILD_TYPE=RelWithDebInfo
      ;;
     -n | --no-install )
       INSTALL_TARGET=""
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index e8a39e250e..e7dcb0a323 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -45,19 +45,20 @@ dependencies:
 - ninja
 - nltk
 - numba>=0.57
+- numpy>=1.23,<3.0a0
 - numpydoc
 - nvcc_linux-64=11.8
 - packaging
 - pip
 - pydata-sphinx-theme!=0.14.2
 - pylibraft==24.10.*,>=0.0.0a0
-- pynndescent==0.5.8
+- pynndescent
 - pytest-benchmark
 - pytest-cases
 - pytest-cov
 - pytest-xdist
 - pytest==7.*
-- python>=3.9,<3.12
+- python>=3.10,<3.13
 - raft-dask==24.10.*,>=0.0.0a0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
@@ -67,13 +68,14 @@ dependencies:
 - scikit-learn==1.5
 - scipy>=1.8.0
 - seaborn
+- setuptools
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx<6
 - statsmodels
 - sysroot_linux-64==2.17
 - treelite==4.3.0
-- umap-learn==0.5.3
+- umap-learn==0.5.6
 - pip:
   - dask-glm==0.3.0
 name: all_cuda-118_arch-x86_64
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index 293028cdb1..2340040085 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -42,18 +42,19 @@ dependencies:
 - ninja
 - nltk
 - numba>=0.57
+- numpy>=1.23,<3.0a0
 - numpydoc
 - packaging
 - pip
 - pydata-sphinx-theme!=0.14.2
 - pylibraft==24.10.*,>=0.0.0a0
-- pynndescent==0.5.8
+- pynndescent
 - pytest-benchmark
 - pytest-cases
 - pytest-cov
 - pytest-xdist
 - pytest==7.*
-- python>=3.9,<3.12
+- python>=3.10,<3.13
 - raft-dask==24.10.*,>=0.0.0a0
 - rapids-build-backend>=0.3.0,<0.4.0.dev0
 - rapids-dask-dependency==24.10.*,>=0.0.0a0
@@ -63,13 +64,14 @@ dependencies:
 - scikit-learn==1.5
 - scipy>=1.8.0
 - seaborn
+- setuptools
 - sphinx-copybutton
 - sphinx-markdown-tables
 - sphinx<6
 - statsmodels
 - sysroot_linux-64==2.17
 - treelite==4.3.0
-- umap-learn==0.5.3
+- umap-learn==0.5.6
 - pip:
   - dask-glm==0.3.0
 name: all_cuda-125_arch-x86_64
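Reviewer note: the environment updates above move the supported Python range to 3.10–3.12, allow NumPy 2.x, and bump umap-learn to 0.5.6. A quick smoke check of those pins (hypothetical snippet, not part of the diff):

```python
# Hypothetical smoke test for the updated pins; assumes numpy and umap-learn
# are installed from one of the environments above.
import sys
import numpy
import umap  # umap-learn

# python>=3.10,<3.13
assert (3, 10) <= sys.version_info[:2] < (3, 13), sys.version

# numpy>=1.23,<3.0a0 (NumPy 2.x now allowed)
major, minor = (int(p) for p in numpy.__version__.split(".")[:2])
assert (major, minor) >= (1, 23) and major < 3, numpy.__version__

print(sys.version_info[:2], numpy.__version__, umap.__version__)
```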
diff --git a/conda/recipes/cuml-cpu/meta.yaml b/conda/recipes/cuml-cpu/meta.yaml
index 09686ff9dd..97e5cdd813 100644
--- a/conda/recipes/cuml-cpu/meta.yaml
+++ b/conda/recipes/cuml-cpu/meta.yaml
@@ -31,11 +31,11 @@ requirements:
     - rapids-build-backend>=0.3.0,<0.4.0.dev0
   run:
     - python x.x
-    - numpy>=1.23,<2.0a0
+    - numpy>=1.23,<3.0a0
     - pandas
     - scikit-learn=1.2
     - hdbscan>=0.8.38,<0.8.39
-    - umap-learn=0.5.3
+    - umap-learn=0.5.6
     - nvtx

 tests:  # [linux64]
diff --git a/conda/recipes/cuml/meta.yaml b/conda/recipes/cuml/meta.yaml
index bca0b0378c..74ba26ea14 100644
--- a/conda/recipes/cuml/meta.yaml
+++ b/conda/recipes/cuml/meta.yaml
@@ -85,6 +85,7 @@ requirements:
     - joblib >=0.11
     - libcuml ={{ version }}
     - libcumlprims ={{ minor_version }}
+    - numpy >=1.23,<3.0a0
     - pylibraft ={{ minor_version }}
     - python x.x
     - raft-dask ={{ minor_version }}
diff --git a/cpp/include/cuml/experimental/fil/README.md b/cpp/include/cuml/experimental/fil/README.md
index e195e6cd64..48d4a4ab16 100644
--- a/cpp/include/cuml/experimental/fil/README.md
+++ b/cpp/include/cuml/experimental/fil/README.md
@@ -39,7 +39,7 @@ similar load methods for each of the serialization formats it supports.
 
 ```cpp
 auto filename = "xgboost.json";
-auto tl_model = treelite::frontend::LoadXGBoostModel(filename);
+auto tl_model = treelite::model_loader::LoadXGBoostModelJSON(filename, "{}");
 ```
 
 We then import the Treelite model into FIL via the
diff --git a/cpp/include/cuml/manifold/umap.hpp b/cpp/include/cuml/manifold/umap.hpp
index 62a875e685..7de08c5488 100644
--- a/cpp/include/cuml/manifold/umap.hpp
+++ b/cpp/include/cuml/manifold/umap.hpp
@@ -84,6 +84,27 @@ void refine(const raft::handle_t& handle,
             UMAPParams* params,
             float* embeddings);
 
+/**
+ * Initializes embeddings and performs a UMAP fit on them, which enables
+ * iterative fitting without callbacks.
+ *
+ * @param[in] handle: raft::handle_t
+ * @param[in] X: pointer to input array
+ * @param[in] n: n_samples of input array
+ * @param[in] d: n_features of input array
+ * @param[in] graph: pointer to raft::sparse::COO object computed using ML::UMAP::get_graph
+ * @param[in] params: pointer to ML::UMAPParams object
+ * @param[out] embeddings: pointer to current embedding with shape n * n_components, stores updated
+ * embeddings on executing refine
+ */
+void init_and_refine(const raft::handle_t& handle,
+                     float* X,
+                     int n,
+                     int d,
+                     raft::sparse::COO<float, int>* graph,
+                     UMAPParams* params,
+                     float* embeddings);
+
 /**
  * Dense fit
  *
diff --git a/cpp/src/fil/treelite_import.cu b/cpp/src/fil/treelite_import.cu
index bc3a13abb8..2a584c0095 100644
--- a/cpp/src/fil/treelite_import.cu
+++ b/cpp/src/fil/treelite_import.cu
@@ -490,10 +490,11 @@ void tl2fil_common(forest_params_t* params,
   ASSERT(model.num_target == 1, "FIL does not support multi-target models");
 
   // assuming either all leaves use the .leaf_vector() or all leaves use .leaf_value()
-  size_t leaf_vec_size = tl_leaf_vector_size(model);
+  std::size_t leaf_vec_size = tl_leaf_vector_size(model);
   std::string pred_transform(model.postprocessor);
   if (leaf_vec_size > 0) {
-    ASSERT(leaf_vec_size == model.num_class[0], "treelite model inconsistent");
+    ASSERT(leaf_vec_size == static_cast<std::size_t>(model.num_class[0]),
+           "treelite model inconsistent");
     params->num_classes = leaf_vec_size;
     params->leaf_algo   = leaf_algo_t::VECTOR_LEAF;
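Reviewer note: the hunk below tightens the signed/unsigned comparison in the grove-per-class layout check. For clarity, a small Python sketch of the invariant being asserted (hypothetical helper, not part of the diff):

```python
# Hypothetical mirror of the C++ check: tree i must contribute to class
# (i % num_class), i.e. the trees are laid out grove-per-class.
def check_grove_per_class(class_ids, num_class):
    for tree_id, class_id in enumerate(class_ids):
        if class_id != tree_id % num_class:
            raise ValueError(
                "The tree model is not compatible with FIL; the trees must be "
                "laid out such that tree i's output contributes towards class "
                "(i % num_class)."
            )

check_grove_per_class([0, 1, 2, 0, 1, 2], num_class=3)  # valid layout
```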
@@ -513,7 +514,8 @@ void tl2fil_common(forest_params_t* params,
     // Ensure that the trees follow the grove-per-class layout.
     for (size_t tree_id = 0; tree_id < model_preset.trees.size(); ++tree_id) {
       ASSERT(model.target_id[tree_id] == 0, "FIL does not support multi-target models");
-      ASSERT(model.class_id[tree_id] == tree_id % static_cast<std::size_t>(model.num_class[0]),
+      ASSERT(static_cast<std::size_t>(model.class_id[tree_id]) ==
+               tree_id % static_cast<std::size_t>(model.num_class[0]),
              "The tree model is not compatible with FIL; the trees must be laid out "
              "such that tree i's output contributes towards class (i %% num_class).");
     }
diff --git a/cpp/src/umap/runner.cuh b/cpp/src/umap/runner.cuh
index 41bac31678..0ceeb3acaa 100644
--- a/cpp/src/umap/runner.cuh
+++ b/cpp/src/umap/runner.cuh
@@ -247,12 +247,31 @@ void _refine(const raft::handle_t& handle,
              value_t* embeddings)
 {
   cudaStream_t stream = handle.get_stream();
+  ML::Logger::get().setLevel(params->verbosity);
+
   /**
    * Run simplicial set embedding to approximate low-dimensional representation
    */
   SimplSetEmbed::run(inputs.n, inputs.d, graph, params, embeddings, stream);
 }
 
+template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
+void _init_and_refine(const raft::handle_t& handle,
+                      const umap_inputs& inputs,
+                      UMAPParams* params,
+                      raft::sparse::COO<value_t>* graph,
+                      value_t* embeddings)
+{
+  cudaStream_t stream = handle.get_stream();
+  ML::Logger::get().setLevel(params->verbosity);
+
+  // Initialize embeddings
+  InitEmbed::run(handle, inputs.n, inputs.d, graph, params, embeddings, stream, params->init);
+
+  // Run simplicial set embedding
+  SimplSetEmbed::run(inputs.n, inputs.d, graph, params, embeddings, stream);
+}
+
 template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
 void _fit(const raft::handle_t& handle,
           const umap_inputs& inputs,
diff --git a/cpp/src/umap/umap.cu b/cpp/src/umap/umap.cu
index 86799ae6bc..899051f8de 100644
--- a/cpp/src/umap/umap.cu
+++ b/cpp/src/umap/umap.cu
@@ -92,6 +92,20 @@ void refine(const raft::handle_t& handle,
     handle, inputs, params, graph, embeddings);
 }
 
+void init_and_refine(const raft::handle_t& handle,
+                     float* X,
+                     int n,
+                     int d,
+                     raft::sparse::COO<float, int>* graph,
+                     UMAPParams* params,
+                     float* embeddings)
+{
+  CUML_LOG_DEBUG("Calling UMAP::init_and_refine() with precomputed KNN");
+  manifold_dense_inputs_t<float> inputs(X, nullptr, n, d);
+  UMAPAlgo::_init_and_refine<knn_indices_dense_t, float, manifold_dense_inputs_t<float>, TPB_X>(
+    handle, inputs, params, graph, embeddings);
+}
+
 void fit(const raft::handle_t& handle,
          float* X,
          float* y,
diff --git a/dependencies.yaml b/dependencies.yaml
index 8c2508ce6b..23a72d1db8 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -229,6 +229,7 @@ dependencies:
       - dask-cuda==24.10.*,>=0.0.0a0
       - joblib>=0.11
       - numba>=0.57
+      - numpy>=1.23,<3.0a0
       # TODO: Is scipy really a hard dependency, or should
       # we make it optional (i.e. an extra for pip
       # installation/run_constrained for conda)?
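Reviewer note: the `numpy>=1.23,<3.0a0` pin added above (and mirrored in the conda/pip metadata) can be sanity-checked at runtime with the `packaging` library (illustrative only):

```python
# Illustrative runtime check of the new NumPy pin using `packaging`.
from packaging.specifiers import SpecifierSet
from packaging.version import Version
import numpy

spec = SpecifierSet(">=1.23,<3.0a0")
assert Version(numpy.__version__) in spec, numpy.__version__
```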
@@ -462,10 +463,6 @@ dependencies:
     specific:
       - output_types: conda
         matrices:
-          - matrix:
-              py: "3.9"
-            packages:
-              - python=3.9
           - matrix:
               py: "3.10"
             packages:
               - python=3.10
@@ -474,9 +471,13 @@ dependencies:
               py: "3.11"
             packages:
               - python=3.11
+          - matrix:
+              py: "3.12"
+            packages:
+              - python=3.12
           - matrix:
             packages:
-              - python>=3.9,<3.12
+              - python>=3.10,<3.13
   test_libcuml:
     common:
       - output_types: conda
@@ -512,8 +513,9 @@ dependencies:
       - seaborn
       - *scikit_learn
       - statsmodels
-      - umap-learn==0.5.3
-      - pynndescent==0.5.8
+      - umap-learn==0.5.6
+      - pynndescent
+      - setuptools  # Needed on Python 3.12 for dask-glm, which requires pkg_resources but Python 3.12 doesn't have setuptools by default
     - output_types: conda
       packages:
       - pip
diff --git a/python/cuml/cuml/common/kernel_utils.py b/python/cuml/cuml/common/kernel_utils.py
index 86d6ad831a..89a861060e 100644
--- a/python/cuml/cuml/common/kernel_utils.py
+++ b/python/cuml/cuml/common/kernel_utils.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -101,11 +101,9 @@ def cuda_kernel_factory(nvrtc_kernel_str, dtypes, kernel_name=None):
             "{%d}" % idx, dtype_strs[idx]
         )
 
-    kernel_name = f"""{uuid1()
-                    if kernel_name is None
-                    else kernel_name}_{
-        "".join(dtype_strs).replace(" ", "_")
-    }"""
+    kernel_name_prefix = uuid1() if kernel_name is None else kernel_name
+    kernel_name_suffix = "".join(dtype_strs).replace(" ", "_")
+    kernel_name = f"{kernel_name_prefix}_{kernel_name_suffix}"
 
     nvrtc_kernel_str = "%s\nvoid %s%s" % (
         extern_prefix,
diff --git a/python/cuml/cuml/dask/manifold/umap.py b/python/cuml/cuml/dask/manifold/umap.py
index 9af1047050..181bfb0728 100644
--- a/python/cuml/cuml/dask/manifold/umap.py
+++ b/python/cuml/cuml/dask/manifold/umap.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -83,9 +83,7 @@ class UMAP(BaseEstimator, DelayedTransformMixin):
 
     In addition to these missing features, you should expect to see
     the final embeddings differing between `cuml.umap` and the reference
-    UMAP. In particular, the reference UMAP uses an approximate kNN
-    algorithm for large data sizes while cuml.umap always uses exact
-    kNN.
+    UMAP.
 
     **Known issue:** If a UMAP model has not yet been fit, it cannot be pickled
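Reviewer note: the kernel_utils change above is a behavior-preserving cleanup of an awkward nested f-string. A standalone sketch of the resulting naming scheme (`make_kernel_name` is a hypothetical stand-in for the inlined logic):

```python
# Sketch of the kernel-name construction in cuda_kernel_factory after the
# refactor; make_kernel_name is hypothetical, the logic matches the + lines.
from uuid import uuid1

def make_kernel_name(kernel_name, dtype_strs):
    prefix = uuid1() if kernel_name is None else kernel_name
    suffix = "".join(dtype_strs).replace(" ", "_")
    return f"{prefix}_{suffix}"

print(make_kernel_name("axpy", ["float", "double"]))  # axpy_floatdouble
print(make_kernel_name(None, ["float"]))              # <uuid>_float
```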
diff --git a/python/cuml/cuml/internals/available_devices.py b/python/cuml/cuml/internals/available_devices.py
index 8110f1b5d1..ee3b9b2500 100644
--- a/python/cuml/cuml/internals/available_devices.py
+++ b/python/cuml/cuml/internals/available_devices.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -16,12 +16,8 @@
 from cuml.internals.device_support import GPU_ENABLED
 from cuml.internals.safe_imports import gpu_only_import_from, UnavailableError
 
-try:
-    from functools import cache  # requires Python >= 3.9
-except ImportError:
-    from functools import lru_cache
-
-    cache = lru_cache(maxsize=None)
+from functools import cache
 
 
 def gpu_available_no_context_creation():
diff --git a/python/cuml/cuml/manifold/simpl_set.pyx b/python/cuml/cuml/manifold/simpl_set.pyx
index f22f524bf7..b0be2d5de7 100644
--- a/python/cuml/cuml/manifold/simpl_set.pyx
+++ b/python/cuml/cuml/manifold/simpl_set.pyx
@@ -16,6 +16,7 @@
 
 # distutils: language = c++
 
+import warnings
 from cuml.internals.safe_imports import cpu_only_import
 np = cpu_only_import('numpy')
 from cuml.internals.safe_imports import gpu_only_import
@@ -26,7 +27,7 @@ from cuml.manifold.umap_utils cimport *
 from cuml.manifold.umap_utils import GraphHolder, find_ab_params, \
     metric_parsing
 
-from cuml.internals.input_utils import input_to_cuml_array
+from cuml.internals.input_utils import input_to_cuml_array, is_array_like
 from cuml.internals.array import CumlArray
 
 from pylibraft.common.handle cimport handle_t
@@ -56,6 +57,14 @@ cdef extern from "cuml/manifold/umap.hpp" namespace "ML::UMAP":
                 UMAPParams* params,
                 float* embeddings)
 
+    void init_and_refine(handle_t &handle,
+                         float* X,
+                         int n,
+                         int d,
+                         COO* cgraph_coo,
+                         UMAPParams* params,
+                         float* embeddings)
+
 
 def fuzzy_simplicial_set(X,
                          n_neighbors,
@@ -73,6 +82,7 @@ def fuzzy_simplicial_set(X,
     locally approximating geodesic distance at each point, creating a fuzzy
     simplicial set for each such point, and then combining all the local
     fuzzy simplicial sets into a global one via a fuzzy union.
+
     Parameters
     ----------
     X: array of shape (n_samples, n_features)
@@ -212,7 +222,7 @@ def simplicial_set_embedding(
     initial_alpha=1.0,
     a=None,
     b=None,
-    repulsion_strength=1.0,
+    gamma=1.0,
     negative_sample_rate=5,
     n_epochs=None,
     init="spectral",
@@ -221,6 +231,7 @@ def simplicial_set_embedding(
     metric_kwds=None,
    output_metric="euclidean",
    output_metric_kwds=None,
+    repulsion_strength=None,
    convert_dtype=True,
    verbose=False,
 ):
@@ -228,6 +239,7 @@ def simplicial_set_embedding(
     initialisation method and then minimizing the fuzzy set cross entropy
     between the 1-skeletons of the high and low dimensional fuzzy simplicial
     sets.
+
     Parameters
     ----------
     data: array of shape (n_samples, n_features)
@@ -244,7 +256,7 @@ def simplicial_set_embedding(
         Parameter of differentiable approximation of right adjoint functor
     b: float
         Parameter of differentiable approximation of right adjoint functor
-    repulsion_strength: float
+    gamma: float
         Weight to apply to negative samples.
     negative_sample_rate: int (optional, default 5)
         The number of negative samples to select per positive sample
@@ -260,7 +272,7 @@ def simplicial_set_embedding(
         How to initialize the low dimensional embedding. Options are:
             * 'spectral': use a spectral embedding of the fuzzy 1-skeleton
             * 'random': assign initial embedding positions at random.
-            * A numpy array of initial embedding positions.
+            * An array-like with initial embedding positions.
     random_state: numpy RandomState or equivalent
         A state capable of being used as a numpy random state.
     metric: string (default='euclidean').
@@ -294,9 +306,6 @@ def simplicial_set_embedding(
     if output_metric_kwds is None:
         output_metric_kwds = {}
 
-    if init not in ['spectral', 'random']:
-        raise Exception("Initialization strategy not supported: %d" % init)
-
     if output_metric not in ['euclidean', 'categorical']:
         raise Exception("Invalid output metric: {}".format(output_metric))
 
@@ -320,17 +329,29 @@ def simplicial_set_embedding(
     cdef UMAPParams* umap_params = new UMAPParams()
     umap_params.n_components = n_components
     umap_params.initial_alpha = initial_alpha
-    umap_params.a = a
-    umap_params.b = b
-    umap_params.repulsion_strength = repulsion_strength
+    umap_params.a = a
+    umap_params.b = b
+
+    if repulsion_strength:
+        gamma = repulsion_strength
+        warnings.simplefilter(action="always", category=FutureWarning)
+        warnings.warn('Parameter "repulsion_strength" has been'
+                      ' deprecated. It will be removed in version 24.12.'
+                      ' Please use the "gamma" parameter instead.',
+                      FutureWarning)
+
+    umap_params.repulsion_strength = gamma
     umap_params.negative_sample_rate = negative_sample_rate
     umap_params.n_epochs = n_epochs
-    if init == 'spectral':
-        umap_params.init = 1
-    else:  # init == 'random'
-        umap_params.init = 0
     umap_params.random_state = random_state
     umap_params.deterministic = deterministic
+    if isinstance(init, str):
+        if init == "random":
+            umap_params.init = 0
+        elif init == 'spectral':
+            umap_params.init = 1
+        else:
+            raise ValueError("Invalid initialization strategy")
     try:
         umap_params.metric = metric_parsing[metric.lower()]
     except KeyError:
@@ -344,7 +365,7 @@ def simplicial_set_embedding(
     else:  # output_metric == 'categorical'
         umap_params.target_metric = MetricType.CATEGORICAL
     umap_params.target_weight = output_metric_kwds['p'] \
-        if 'p' in output_metric_kwds else 0
+        if 'p' in output_metric_kwds else 0.5
     umap_params.verbosity = verbose
 
     X_m, _, _, _ = \
@@ -365,17 +386,40 @@ def simplicial_set_embedding(
         handle,
         graph)
 
-    embedding = CumlArray.zeros((X_m.shape[0], n_components),
-                                order="C", dtype=np.float32,
-                                index=X_m.index)
-
-    refine(handle_[0],
-           X_m.ptr,
-           X_m.shape[0],
-           X_m.shape[1],
-           fss_graph.get(),
-           umap_params,
-           embedding.ptr)
+    if isinstance(init, str):
+        if init in ['spectral', 'random']:
+            embedding = CumlArray.zeros((X_m.shape[0], n_components),
+                                        order="C", dtype=np.float32,
+                                        index=X_m.index)
+            init_and_refine(handle_[0],
+                            X_m.ptr,
+                            X_m.shape[0],
+                            X_m.shape[1],
+                            fss_graph.get(),
+                            umap_params,
+                            embedding.ptr)
+        else:
+            raise ValueError("Invalid initialization strategy")
+    elif is_array_like(init):
+        embedding, _, _, _ = \
+            input_to_cuml_array(init,
+                                order='C',
+                                convert_to_dtype=(np.float32 if convert_dtype
+                                                  else None),
+                                check_dtype=np.float32,
+                                check_rows=X_m.shape[0],
+                                check_cols=n_components)
+        refine(handle_[0],
+               X_m.ptr,
+               X_m.shape[0],
+               X_m.shape[1],
+               fss_graph.get(),
+               umap_params,
+               embedding.ptr)
+    else:
+        raise ValueError(
+            "Initialization not supported. Please provide a valid "
+            "initialization strategy or a pre-initialized embedding.")
 
     free(umap_params)
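Reviewer note: taken together, the simpl_set.pyx changes let callers seed `simplicial_set_embedding` either with a named strategy (now routed through `init_and_refine`) or with a pre-initialized embedding (routed through `refine`). A hedged usage sketch (data and parameters are arbitrary):

```python
# Hedged usage sketch of the new init paths; assumes the module paths in this
# PR and arbitrary example data.
from cuml.datasets import make_blobs
from cuml.manifold.simpl_set import (
    fuzzy_simplicial_set,
    simplicial_set_embedding,
)

X, _ = make_blobs(n_samples=500, n_features=8, random_state=42)
graph = fuzzy_simplicial_set(X, n_neighbors=15, random_state=42,
                             metric="euclidean")

# String init ('spectral' or 'random') now calls init_and_refine() in C++.
emb = simplicial_set_embedding(X, graph, n_components=2, init="random",
                               gamma=1.0)

# An array-like init is validated and refined in place via refine().
emb2 = simplicial_set_embedding(X, graph, n_components=2, init=emb)
```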
diff --git a/python/cuml/cuml/manifold/t_sne.pyx b/python/cuml/cuml/manifold/t_sne.pyx
index 264722af76..d230ee8467 100644
--- a/python/cuml/cuml/manifold/t_sne.pyx
+++ b/python/cuml/cuml/manifold/t_sne.pyx
@@ -27,10 +27,13 @@ cupy = gpu_only_import('cupy')
 
 import cuml.internals
 from cuml.common.array_descriptor import CumlArrayDescriptor
-from cuml.internals.base import Base
+from cuml.internals.base import UniversalBase
 from pylibraft.common.handle cimport handle_t
+from cuml.internals.api_decorators import device_interop_preparation
+from cuml.internals.api_decorators import enable_device_interop
 import cuml.internals.logger as logger
+
 from cuml.internals.array import CumlArray
 from cuml.internals.array_sparse import SparseCumlArray
 from cuml.common.sparse_utils import is_sparse
@@ -115,7 +118,7 @@ cdef extern from "cuml/manifold/tsne.h" namespace "ML":
                    float* kl_div) except +
 
 
-class TSNE(Base,
+class TSNE(UniversalBase,
            CMajorInputTagMixin):
     """
     t-SNE (T-Distributed Stochastic Neighbor Embedding) is an extremely
@@ -263,9 +266,11 @@ class TSNE(Base,
 
     """
 
+    _cpu_estimator_import_path = 'sklearn.manifold.TSNE'
     X_m = CumlArrayDescriptor()
     embedding_ = CumlArrayDescriptor()
 
+    @device_interop_preparation
     def __init__(self, *,
                  n_components=2,
                  perplexity=30.0,
@@ -405,6 +410,7 @@ class TSNE(Base,
     @generate_docstring(skip_parameters_heading=True,
                         X='dense_sparse',
                         convert_dtype_cast='np.float32')
+    @enable_device_interop
     def fit(self, X, convert_dtype=True, knn_graph=None) -> "TSNE":
         """
         Fit X into an embedded space.
@@ -444,6 +450,8 @@ class TSNE(Base,
                                 if convert_dtype
                                 else None))
 
+        self.n_features_in_ = p
+
         if n <= 1:
             raise ValueError("There needs to be more than 1 sample to build "
                              "the nearest neighbors graph")
@@ -561,6 +569,7 @@ class TSNE(Base,
                                        low-dimensional space.',
                                        'shape': '(n_samples, n_components)'})
     @cuml.internals.api_base_fit_transform()
+    @enable_device_interop
     def fit_transform(self, X, convert_dtype=True,
                       knn_graph=None) -> CumlArray:
         """
@@ -648,6 +657,22 @@ class TSNE(Base,
     def kl_divergence_(self, value):
         self._kl_divergence_ = value
 
+    @property
+    def learning_rate_(self):
+        return self.learning_rate
+
+    @learning_rate_.setter
+    def learning_rate_(self, value):
+        self.learning_rate = value
+
+    @property
+    def n_iter_(self):
+        return self.n_iter
+
+    @n_iter_.setter
+    def n_iter_(self, value):
+        self.n_iter = value
+
     def __del__(self):
 
         if hasattr(self, "embedding_"):
@@ -690,3 +715,8 @@ class TSNE(Base,
             "square_distances",
             "precomputed_knn"
         ]
+
+    def get_attr_names(self):
+        return ["embedding", "kl_divergence_",
+                "n_features_in_", "learning_rate_",
+                "n_iter_"]
diff --git a/python/cuml/cuml/manifold/umap.pyx b/python/cuml/cuml/manifold/umap.pyx
index 86933ab31b..260b32ee6b 100644
--- a/python/cuml/cuml/manifold/umap.pyx
+++ b/python/cuml/cuml/manifold/umap.pyx
@@ -16,9 +16,11 @@
 
 # distutils: language = c++
 
-from cuml.internals.safe_imports import cpu_only_import
+from cuml.internals.safe_imports import cpu_only_import, safe_import_from
 np = cpu_only_import('numpy')
 pd = cpu_only_import('pandas')
+nearest_neighbors = safe_import_from('umap.umap_', 'nearest_neighbors')
+DISCONNECTION_DISTANCES = safe_import_from('umap.umap_', 'DISCONNECTION_DISTANCES')
 
 import joblib
 import warnings
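Reviewer note: with TSNE now deriving from `UniversalBase` and the interop decorators applied, the estimator participates in cuML's CPU/GPU device selection. A hedged sketch of the pattern exercised by the tests later in this diff:

```python
# Hedged sketch of TSNE device interop (import path as used in cuml tests).
from cuml.common.device_selection import using_device_type
from cuml.manifold import TSNE
from sklearn.datasets import make_blobs

X, _ = make_blobs(n_samples=1000, centers=5, n_features=30, random_state=42)

model = TSNE(n_components=2)
with using_device_type("gpu"):   # execute on GPU
    emb_gpu = model.fit_transform(X)

with using_device_type("cpu"):   # dispatch the same estimator to sklearn
    emb_cpu = model.fit_transform(X)
```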
@@ -313,9 +315,7 @@ class UMAP(UniversalBase,
 
     In addition to these missing features, you should expect to see
     the final embeddings differing between cuml.umap and the reference
-    UMAP. In particular, the reference UMAP uses an approximate kNN
-    algorithm for large data sizes while cuml.umap always uses exact
-    kNN.
+    UMAP.
 
     References
     ----------
@@ -627,6 +627,8 @@ class UMAP(UniversalBase,
 
             _knn_dists_ptr = knn_dists.ptr
             _knn_indices_ptr = knn_indices.ptr
+            self._knn_dists = knn_dists
+            self._knn_indices = knn_indices
 
         self.n_neighbors = min(self.n_rows, self.n_neighbors)
 
@@ -853,6 +855,60 @@ class UMAP(UniversalBase,
         del X_m
         return embedding
 
+    @property
+    def _n_neighbors(self):
+        return self.n_neighbors
+
+    @_n_neighbors.setter
+    def _n_neighbors(self, value):
+        self.n_neighbors = value
+
+    @property
+    def _a(self):
+        return self.a
+
+    @_a.setter
+    def _a(self, value):
+        self.a = value
+
+    @property
+    def _b(self):
+        return self.b
+
+    @_b.setter
+    def _b(self, value):
+        self.b = value
+
+    @property
+    def _initial_alpha(self):
+        return self.learning_rate
+
+    @_initial_alpha.setter
+    def _initial_alpha(self, value):
+        self.learning_rate = value
+
+    @property
+    def _disconnection_distance(self):
+        self.disconnection_distance = DISCONNECTION_DISTANCES.get(self.metric, np.inf)
+        return self.disconnection_distance
+
+    @_disconnection_distance.setter
+    def _disconnection_distance(self, value):
+        self.disconnection_distance = value
+
+    def gpu_to_cpu(self):
+        if hasattr(self, 'knn_dists') and hasattr(self, 'knn_indices'):
+            self._knn_dists = self.knn_dists
+            self._knn_indices = self.knn_indices
+            self._knn_search_index = None
+        elif hasattr(self, '_raw_data'):
+            self._raw_data = self._raw_data.to_output('numpy')
+            self._knn_dists, self._knn_indices, self._knn_search_index = \
+                nearest_neighbors(self._raw_data, self.n_neighbors, self.metric,
+                                  self.metric_kwds, False, self.random_state)
+
+        super().gpu_to_cpu()
+
     def get_param_names(self):
         return super().get_param_names() + [
             "n_neighbors",
@@ -883,4 +939,7 @@ class UMAP(UniversalBase,
         ]
 
     def get_attr_names(self):
-        return ['_raw_data', 'embedding_', '_input_hash', '_small_data']
+        return ['_raw_data', 'embedding_', '_input_hash', '_small_data',
+                '_knn_dists', '_knn_indices', '_knn_search_index',
+                '_disconnection_distance', '_n_neighbors', '_a', '_b',
+                '_initial_alpha']
diff --git a/python/cuml/cuml/neighbors/CMakeLists.txt b/python/cuml/cuml/neighbors/CMakeLists.txt
index 2938fac779..dbb23550aa 100644
--- a/python/cuml/cuml/neighbors/CMakeLists.txt
+++ b/python/cuml/cuml/neighbors/CMakeLists.txt
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2022, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
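Reviewer note: `gpu_to_cpu()` above materializes the kNN state (falling back to umap-learn's `nearest_neighbors` when only raw data is present), which is what makes GPU-trained UMAP models usable after transfer to CPU. A hedged sketch, loosely following `test_pickle_interop` below; the exact test mechanics may differ:

```python
# Hedged sketch: train UMAP on GPU, pickle, and infer on CPU.
import pickle

from cuml.common.device_selection import using_device_type
from cuml.datasets import make_blobs
from cuml.manifold import UMAP

X, _ = make_blobs(n_samples=500, n_features=10, random_state=42)

model = UMAP(n_neighbors=12)
with using_device_type("gpu"):
    model.fit(X)

restored = pickle.loads(pickle.dumps(model))  # transfer uses gpu_to_cpu()

with using_device_type("cpu"):
    embedding = restored.transform(X)
```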
@@ -34,7 +34,3 @@ rapids_cython_create_modules(
   MODULE_PREFIX neighbors_
   ASSOCIATED_TARGETS cuml
 )
-
-foreach(target IN LISTS targets_using_numpy)
-  target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
-endforeach()
diff --git a/python/cuml/cuml/tests/dask/conftest.py b/python/cuml/cuml/tests/dask/conftest.py
index 27fb746e1c..bdaf591538 100644
--- a/python/cuml/cuml/tests/dask/conftest.py
+++ b/python/cuml/cuml/tests/dask/conftest.py
@@ -72,11 +72,17 @@ def pytest_addoption(parser):
     group = parser.getgroup("Dask cuML Custom Options")
 
     group.addoption(
-        "--run_ucx", action="store_true", help="run _only_ UCX-Py tests"
+        "--run_ucx",
+        action="store_true",
+        default=False,
+        help="run _only_ UCX-Py tests",
     )
 
     group.addoption(
-        "--run_ucxx", action="store_true", help="run _only_ UCXX tests"
+        "--run_ucxx",
+        action="store_true",
+        default=False,
+        help="run _only_ UCXX tests",
     )
diff --git a/python/cuml/cuml/tests/dask/test_dask_pca.py b/python/cuml/cuml/tests/dask/test_dask_pca.py
index 6ee5ba1d25..7e600d331d 100644
--- a/python/cuml/cuml/tests/dask/test_dask_pca.py
+++ b/python/cuml/cuml/tests/dask/test_dask_pca.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -73,7 +73,7 @@ def test_pca_fit(nrows, ncols, n_parts, input_type, client):
     for attr in all_attr:
         with_sign = False if attr in ["components_"] else True
         cuml_res = getattr(cupca, attr)
-        if type(cuml_res) == np.ndarray:
+        if type(cuml_res) is np.ndarray:
             cuml_res = cuml_res.to_numpy()
         skl_res = getattr(skpca, attr)
         assert array_equal(cuml_res, skl_res, 1e-1, with_sign=with_sign)
diff --git a/python/cuml/cuml/tests/dask/test_dask_tsvd.py b/python/cuml/cuml/tests/dask/test_dask_tsvd.py
index 3b7220fb8e..32eb1cd426 100644
--- a/python/cuml/cuml/tests/dask/test_dask_tsvd.py
+++ b/python/cuml/cuml/tests/dask/test_dask_tsvd.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -81,7 +81,7 @@ def test_pca_fit(data_info, input_type, client):
     for attr in all_attr:
         with_sign = False if attr in ["components_"] else True
         cuml_res = getattr(cutsvd, attr)
-        if type(cuml_res) == np.ndarray:
+        if type(cuml_res) is np.ndarray:
             cuml_res = cuml_res.to_numpy()
         skl_res = getattr(sktsvd, attr)
         if attr == "singular_values_":
diff --git a/python/cuml/cuml/tests/test_class_enumerator.py b/python/cuml/cuml/tests/test_class_enumerator.py
index 464d3f308d..71087f5273 100644
--- a/python/cuml/cuml/tests/test_class_enumerator.py
+++ b/python/cuml/cuml/tests/test_class_enumerator.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
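Reviewer note: the repeated `type(...) == type(...)` → `type(...) is type(...)` edits in the test files satisfy flake8 7.x's stricter E721 rule; `is` performs an identity comparison between type objects:

```python
# E721 in flake8 7.x: compare types by identity, not equality.
import numpy as np

a = np.zeros(3)
assert type(a) == np.ndarray      # works, but flagged by E721
assert type(a) is np.ndarray      # preferred exact-type check
assert isinstance(a, np.ndarray)  # allows subclasses, often what you want
```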
@@ -99,7 +99,7 @@ def test_class_enumerator_parameters():
     class SomeModule:
         class SomeClass(cuml.Base):
             def __eq__(self, other):
-                return type(other) == type(self)
+                return type(other) is type(self)
 
     models1 = ClassEnumerator(module=SomeModule).get_models()
     models2 = ClassEnumerator(
diff --git a/python/cuml/cuml/tests/test_compose.py b/python/cuml/cuml/tests/test_compose.py
index c9fdce35df..310aede7f3 100644
--- a/python/cuml/cuml/tests/test_compose.py
+++ b/python/cuml/cuml/tests/test_compose.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2021-2023, NVIDIA CORPORATION.
+# Copyright (c) 2021-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -84,7 +84,7 @@ def test_column_transformer(
     )
     ft_X = transformer.fit_transform(X)
     t_X = transformer.transform(X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_transformers = [
         ("scaler", skStandardScaler(), sk_selec1),
@@ -135,7 +135,7 @@ def test_column_transformer_sparse(
     if dataset_density < sparse_threshold:
         # Sparse input -> sparse output if dataset_density > sparse_threshold
         # else sparse input -> dense output
-        assert type(t_X) == type(X)
+        assert type(t_X) is type(X)
 
     sk_transformers = [
         ("scaler", skStandardScaler(with_mean=False), [0, 2]),
@@ -174,7 +174,7 @@ def test_make_column_transformer(clf_dataset, remainder):  # noqa: F811
 
     ft_X = transformer.fit_transform(X)
     t_X = transformer.transform(X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     transformer = sk_make_column_transformer(
         (skStandardScaler(), sk_selec1),
@@ -210,7 +210,7 @@ def test_make_column_transformer_sparse(
     if dataset_density < sparse_threshold:
         # Sparse input -> sparse output if dataset_density > sparse_threshold
         # else sparse input -> dense output
-        assert type(t_X) == type(X)
+        assert type(t_X) is type(X)
 
     transformer = sk_make_column_transformer(
         (skStandardScaler(with_mean=False), [0, 2]),
@@ -313,7 +313,7 @@ def test_make_column_selector():
     sk_t_X = transformer.fit_transform(X_np)
 
     assert_allclose(t_X, sk_t_X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
 
 def test_column_transformer_index(clf_dataset):  # noqa: F811
diff --git a/python/cuml/cuml/tests/test_device_selection.py b/python/cuml/cuml/tests/test_device_selection.py
index 1da3b0738e..449c032161 100644
--- a/python/cuml/cuml/tests/test_device_selection.py
+++ b/python/cuml/cuml/tests/test_device_selection.py
@@ -21,7 +21,10 @@
 from cuml.neighbors import NearestNeighbors
 from cuml.metrics import trustworthiness
 from cuml.metrics import adjusted_rand_score
-from cuml.manifold import UMAP
+from cuml.manifold import (
+    UMAP,
+    TSNE,
+)
 from cuml.linear_model import (
     ElasticNet,
     Lasso,
@@ -48,6 +51,7 @@
 from sklearn.cluster import KMeans as skKMeans
 from sklearn.cluster import DBSCAN as skDBSCAN
 from sklearn.datasets import make_regression, make_blobs
+from sklearn.manifold import TSNE as refTSNE
 from pytest_cases import fixture_union, fixture
 from importlib import import_module
 import inspect
@@ -596,8 +600,6 @@ def test_train_cpu_infer_cpu(test_data):
 
 def test_train_gpu_infer_cpu(test_data):
     cuEstimator = test_data["cuEstimator"]
-    if cuEstimator is UMAP:
-        pytest.skip("UMAP GPU training CPU inference not yet implemented")
 
     model = cuEstimator(**test_data["kwargs"])
     with using_device_type("gpu"):
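Reviewer note: since stochastic layouts differ between backends, these interop tests accept an embedding by neighborhood preservation rather than element-wise comparison. The pattern, as a hypothetical helper:

```python
# Hypothetical helper capturing the trustworthiness-based acceptance pattern
# used in test_tsne_methods / test_umap_methods below.
from cuml.metrics import trustworthiness

def assert_embedding_quality(X, embedding, ref_embedding,
                             n_neighbors=12, tol=0.02):
    ref_trust = trustworthiness(X, ref_embedding, n_neighbors=n_neighbors)
    trust = trustworthiness(X, embedding, n_neighbors=n_neighbors)
    assert trust >= ref_trust - tol, (trust, ref_trust)
```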
@@ -655,8 +657,6 @@ def test_pickle_interop(tmp_path, test_data):
     pickle_filepath = tmp_path / "model.pickle"
     cuEstimator = test_data["cuEstimator"]
-    if cuEstimator is UMAP:
-        pytest.skip("UMAP GPU training CPU inference not yet implemented")
     model = cuEstimator(**test_data["kwargs"])
     with using_device_type("gpu"):
         if "y_train" in test_data:
@@ -861,6 +861,21 @@ def test_umap_methods(device):
     assert ref_trust - tol <= trust <= ref_trust + tol
 
 
+@pytest.mark.parametrize("device", ["cpu", "gpu"])
+def test_tsne_methods(device):
+    ref_model = refTSNE()
+    ref_embedding = ref_model.fit_transform(X_train_blob)
+    ref_trust = trustworthiness(X_train_blob, ref_embedding, n_neighbors=12)
+
+    model = TSNE(n_neighbors=12)
+    with using_device_type(device):
+        embedding = model.fit_transform(X_train_blob)
+    trust = trustworthiness(X_train_blob, embedding, n_neighbors=12)
+
+    tol = 0.02
+    assert trust >= ref_trust - tol
+
+
 @pytest.mark.parametrize("train_device", ["cpu", "gpu"])
 @pytest.mark.parametrize("infer_device", ["cpu", "gpu"])
 def test_pca_methods(train_device, infer_device):
diff --git a/python/cuml/cuml/tests/test_kernel_ridge.py b/python/cuml/cuml/tests/test_kernel_ridge.py
index d5534b5662..23148e7907 100644
--- a/python/cuml/cuml/tests/test_kernel_ridge.py
+++ b/python/cuml/cuml/tests/test_kernel_ridge.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+# Copyright (c) 2022-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -46,7 +46,7 @@ def gradient_norm(model, X, y, K, sw=None):
     ).reshape(y.shape)
 
     # initialise to NaN in case below loop has 0 iterations
-    grads = cp.full_like(y, np.NAN)
+    grads = cp.full_like(y, np.nan)
     for i, (beta, target, current_alpha) in enumerate(
         zip(betas.T, y.T, model.alpha)
     ):
diff --git a/python/cuml/cuml/tests/test_preprocessing.py b/python/cuml/cuml/tests/test_preprocessing.py
index c341fa2a63..cdbc872ccd 100644
--- a/python/cuml/cuml/tests/test_preprocessing.py
+++ b/python/cuml/cuml/tests/test_preprocessing.py
@@ -108,8 +108,8 @@ def test_minmax_scaler(
     scaler = cuMinMaxScaler(feature_range=feature_range, copy=True)
     t_X = scaler.fit_transform(X)
     r_X = scaler.inverse_transform(t_X)
-    assert type(t_X) == type(X)
-    assert type(r_X) == type(t_X)
+    assert type(t_X) is type(X)
+    assert type(r_X) is type(t_X)
 
     scaler = skMinMaxScaler(feature_range=feature_range, copy=True)
     sk_t_X = scaler.fit_transform(X_np)
@@ -127,7 +127,7 @@ def test_minmax_scale(
     X_np, X = clf_dataset
 
     t_X = cu_minmax_scale(X, feature_range=feature_range, axis=axis)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_t_X = sk_minmax_scale(X_np, feature_range=feature_range, axis=axis)
 
@@ -146,8 +146,8 @@ def test_standard_scaler(
     )
     t_X = scaler.fit_transform(X)
     r_X = scaler.inverse_transform(t_X)
-    assert type(t_X) == type(X)
-    assert type(r_X) == type(t_X)
+    assert type(t_X) is type(X)
+    assert type(r_X) is type(t_X)
 
     scaler = skStandardScaler(
         with_mean=with_mean, with_std=with_std, copy=True
@@ -168,8 +168,8 @@ def test_standard_scaler_sparse(
     scaler = cuStandardScaler(with_mean=False, with_std=with_std, copy=True)
     t_X = scaler.fit_transform(X)
     r_X = scaler.inverse_transform(t_X)
-    # assert type(t_X) == type(X)
-    # assert type(r_X) == type(t_X)
+    # assert type(t_X) is type(X)
+    # assert type(r_X) is type(t_X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -202,7 +202,7 @@ def test_scale(
     t_X = cu_scale(
         X, axis=axis, with_mean=with_mean, with_std=with_std, copy=True
     )
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_t_X = sk_scale(
         X_np,
         axis=axis,
         with_mean=with_mean, with_std=with_std, copy=True
@@ -218,7 +218,7 @@ def test_scale_sparse(
     X_np, X = sparse_clf_dataset
 
     t_X = cu_scale(X, with_mean=False, with_std=with_std, copy=True)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -234,7 +234,7 @@ def test_maxabs_scale(failure_logger, clf_dataset, axis):  # noqa: F811
     X_np, X = clf_dataset
 
     t_X = cu_maxabs_scale(X, axis=axis)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_t_X = sk_maxabs_scale(X_np, axis=axis)
 
@@ -247,8 +247,8 @@ def test_maxabs_scaler(failure_logger, clf_dataset):  # noqa: F811
     scaler = cuMaxAbsScaler(copy=True)
     t_X = scaler.fit_transform(X)
     r_X = scaler.inverse_transform(t_X)
-    assert type(t_X) == type(X)
-    assert type(r_X) == type(t_X)
+    assert type(t_X) is type(X)
+    assert type(r_X) is type(t_X)
 
     scaler = skMaxAbsScaler(copy=True)
     sk_t_X = scaler.fit_transform(X_np)
@@ -266,8 +266,8 @@ def test_maxabs_scaler_sparse(
     scaler = cuMaxAbsScaler(copy=True)
     t_X = scaler.fit_transform(X)
     r_X = scaler.inverse_transform(t_X)
-    # assert type(t_X) == type(X)
-    # assert type(r_X) == type(t_X)
+    # assert type(t_X) is type(X)
+    # assert type(r_X) is type(t_X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -291,7 +291,7 @@ def test_normalizer(failure_logger, clf_dataset, norm):  # noqa: F811
 
     normalizer = cuNormalizer(norm=norm, copy=True)
     t_X = normalizer.fit_transform(X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     normalizer = skNormalizer(norm=norm, copy=True)
     sk_t_X = normalizer.fit_transform(X_np)
@@ -310,7 +310,7 @@ def test_normalizer_sparse(
 
     normalizer = cuNormalizer(norm=norm, copy=True)
     t_X = normalizer.fit_transform(X)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -344,7 +344,7 @@ def test_normalize(
         X_np, axis=axis, norm=norm, return_norm=return_norm
     )
 
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
     assert_allclose(t_X, sk_t_X)
 
 
@@ -357,7 +357,7 @@ def test_normalize_sparse(
     axis = 0 if X.format == "csc" else 1
 
     t_X = cu_normalize(X, axis=axis, norm=norm)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -399,7 +399,7 @@ def test_imputer(
         add_indicator=add_indicator,
     )
     t_X = imputer.fit_transform(X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     imputer = skSimpleImputer(
         copy=True,
@@ -431,7 +431,7 @@ def test_imputer_sparse(sparse_imputer_dataset, strategy):  # noqa: F811
         fill_value=fill_value,
     )
     t_X = imputer.fit_transform(X)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -468,7 +468,7 @@ def test_poly_features(
         include_bias=include_bias,
     )
     t_X = polyfeatures.fit_transform(X)
-    assert type(X) == type(t_X)
+    assert type(X) is type(t_X)
 
     cu_feature_names = polyfeatures.get_feature_names()
 
     if isinstance(t_X, np.ndarray):
@@ -510,7 +510,7 @@ def test_poly_features_sparse(
         include_bias=include_bias,
     )
     t_X = polyfeatures.fit_transform(X)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -531,7 +531,7 @@ def test_add_dummy_feature(failure_logger, clf_dataset, value):  # noqa: F811
     X_np, X = clf_dataset
 
     t_X = cu_add_dummy_feature(X, value=value)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_t_X = sk_add_dummy_feature(X_np, value=value)
     assert_allclose(t_X, sk_t_X)
@@ -544,7 +544,7 @@ def test_add_dummy_feature_sparse(
     X_np, X = sparse_dataset_with_coo
 
     t_X = cu_add_dummy_feature(X, value=value)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -559,7 +559,7 @@ def test_binarize(failure_logger, clf_dataset, threshold):  # noqa: F811
     X_np, X = clf_dataset
 
     t_X = cu_binarize(X, threshold=threshold, copy=True)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_t_X = sk_binarize(X_np, threshold=threshold, copy=True)
 
@@ -573,7 +573,7 @@ def test_binarize_sparse(
     X_np, X = sparse_clf_dataset
 
     t_X = cu_binarize(X, threshold=threshold, copy=True)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -590,7 +590,7 @@ def test_binarizer(failure_logger, clf_dataset, threshold):  # noqa: F811
 
     binarizer = cuBinarizer(threshold=threshold, copy=True)
     t_X = binarizer.fit_transform(X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     binarizer = skBinarizer(threshold=threshold, copy=True)
     sk_t_X = binarizer.fit_transform(X_np)
@@ -606,7 +606,7 @@ def test_binarizer_sparse(
 
     binarizer = cuBinarizer(threshold=threshold, copy=True)
     t_X = binarizer.fit_transform(X)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -638,8 +638,8 @@ def test_robust_scaler(
     )
     t_X = scaler.fit_transform(X)
     r_X = scaler.inverse_transform(t_X)
-    assert type(t_X) == type(X)
-    assert type(r_X) == type(t_X)
+    assert type(t_X) is type(X)
+    assert type(r_X) is type(t_X)
 
     scaler = skRobustScaler(
         with_centering=with_centering,
@@ -675,8 +675,8 @@ def test_robust_scaler_sparse(
     )
     t_X = scaler.fit_transform(X)
     r_X = scaler.inverse_transform(t_X)
-    # assert type(t_X) == type(X)
-    # assert type(r_X) == type(t_X)
+    # assert type(t_X) is type(X)
+    # assert type(r_X) is type(t_X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -721,7 +721,7 @@ def test_robust_scale(
         quantile_range=quantile_range,
         copy=True,
     )
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_t_X = sk_robust_scale(
         X_np,
@@ -760,7 +760,7 @@ def test_robust_scale_sparse(
         quantile_range=quantile_range,
         copy=True,
    )
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     if cpx.scipy.sparse.issparse(X):
         assert cpx.scipy.sparse.issparse(t_X)
     if scipy.sparse.issparse(X):
@@ -814,8 +814,8 @@ def test_kbinsdiscretizer(
     r_X = transformer.inverse_transform(t_X)
 
     if encode != "onehot":
-        assert type(t_X) == type(X)
-        assert type(r_X) == type(t_X)
+        assert type(t_X) is type(X)
+        assert type(r_X) is type(t_X)
 
     transformer = skKBinsDiscretizer(
         n_bins=n_bins, encode=encode, strategy=strategy
@@ -847,10 +847,10 @@ def test_missing_indicator(
         missing_values=missing_values, features=features
     )
     ft_X = indicator.fit_transform(X)
-    assert type(ft_X) == type(X)
+    assert type(ft_X) is type(X)
     indicator.fit(X)
     t_X = indicator.transform(X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     indicator = skMissingIndicator(
         missing_values=missing_values,
         features=features
@@ -875,7 +875,7 @@ def test_missing_indicator_sparse(
     assert cpx.scipy.sparse.issparse(ft_X) or scipy.sparse.issparse(ft_X)
     indicator.fit(X)
     t_X = indicator.transform(X)
-    # assert type(t_X) == type(X)
+    # assert type(t_X) is type(X)
     assert cpx.scipy.sparse.issparse(t_X) or scipy.sparse.issparse(t_X)
 
     indicator = skMissingIndicator(features=features, missing_values=1)
@@ -895,8 +895,8 @@ def test_function_transformer(clf_dataset):  # noqa: F811
     )
     t_X = transformer.fit_transform(X)
     r_X = transformer.inverse_transform(t_X)
-    assert type(t_X) == type(X)
-    assert type(r_X) == type(t_X)
+    assert type(t_X) is type(X)
+    assert type(r_X) is type(t_X)
 
     transformer = skFunctionTransformer(
         func=np.exp, inverse_func=np.log, check_inverse=False
@@ -952,9 +952,9 @@ def test_quantile_transformer(
         copy=True,
     )
     t_X = transformer.fit_transform(X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
     r_X = transformer.inverse_transform(t_X)
-    assert type(r_X) == type(t_X)
+    assert type(r_X) is type(t_X)
 
     quantiles_ = transformer.quantiles_
     references_ = transformer.references_
@@ -1063,7 +1063,7 @@ def test_quantile_transform(
         random_state=42,
         copy=True,
     )
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_t_X = sk_quantile_transform(
         X_np,
@@ -1090,11 +1090,11 @@ def test_power_transformer(
         method=method, standardize=standardize, copy=True
     )
     ft_X = transformer.fit_transform(X)
-    assert type(ft_X) == type(X)
+    assert type(ft_X) is type(X)
     t_X = transformer.transform(X)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
     r_X = transformer.inverse_transform(t_X)
-    assert type(r_X) == type(t_X)
+    assert type(r_X) is type(t_X)
 
     normalizer = skPowerTransformer(
         method=method, standardize=standardize, copy=True
@@ -1115,7 +1115,7 @@ def test_power_transform(
     X_np, X = nan_filled_positive
 
     t_X = cu_power_transform(X, method=method, standardize=standardize)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     sk_t_X = sk_power_transform(X_np, method=method, standardize=standardize)
 
@@ -1129,7 +1129,7 @@ def test_kernel_centerer():
     model = cuKernelCenterer()
     model.fit(K)
     t_X = model.transform(K, copy=True)
-    assert type(t_X) == type(X)
+    assert type(t_X) is type(X)
 
     model = skKernelCenterer()
     sk_t_X = model.fit_transform(K)
diff --git a/python/cuml/cuml/tests/test_simpl_set.py b/python/cuml/cuml/tests/test_simpl_set.py
index cbc5ebc635..7f55155a9f 100644
--- a/python/cuml/cuml/tests/test_simpl_set.py
+++ b/python/cuml/cuml/tests/test_simpl_set.py
@@ -24,6 +24,7 @@
 import pytest
 from cuml.datasets import make_blobs
 from cuml.internals.safe_imports import cpu_only_import
+from cuml.metrics import trustworthiness
 
 np = cpu_only_import("numpy")
 cp = gpu_only_import("cupy")
@@ -133,7 +134,7 @@ def test_simplicial_set_embedding(
     metric = "euclidean"
     initial_alpha = 1.0
     a, b = UMAP.find_ab_params(1.0, 0.1)
-    gamma = 0
+    gamma = 1.0
     negative_sample_rate = 5
     n_epochs = 500
     init = "random"
@@ -180,7 +181,6 @@ def test_simplicial_set_embedding(
     cu_fss_graph = cu_fuzzy_simplicial_set(
         X, n_neighbors, random_state, metric
     )
-
     cu_embedding = cu_simplicial_set_embedding(
         X,
         cu_fss_graph,
@@ -199,7 +199,7 @@ def test_simplicial_set_embedding(
         output_metric_kwds=output_metric_kwds,
     )
 
-    ref_embedding = cp.array(ref_embedding)
-    assert correctness_dense(
-        ref_embedding, cu_embedding, rtol=0.1, threshold=0.95
-    )
+    ref_t_score = trustworthiness(X, ref_embedding, n_neighbors=n_neighbors)
+    t_score = trustworthiness(X, cu_embedding, n_neighbors=n_neighbors)
+    abs_tol = 0.05
+    assert t_score >= ref_t_score - abs_tol
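Reviewer note: the pyproject.toml changes below drop Python 3.9 and advertise 3.12. A small consistency check between `requires-python` and the trove classifiers (illustrative, using `packaging`):

```python
# Illustrative check: every advertised Python classifier must satisfy the
# requires-python specifier below.
from packaging.specifiers import SpecifierSet

requires_python = SpecifierSet(">=3.10")
classifier_versions = ["3.10", "3.11", "3.12"]

assert all(v in requires_python for v in classifier_versions)
```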
diff --git a/python/cuml/cuml/tests/test_umap.py b/python/cuml/cuml/tests/test_umap.py
index 219810ba6d..41f47bdaee 100644
--- a/python/cuml/cuml/tests/test_umap.py
+++ b/python/cuml/cuml/tests/test_umap.py
@@ -337,7 +337,7 @@ def test_umap_data_formats(
     )
 
     embeds = umap.fit_transform(X)
-    assert type(embeds) == np.ndarray
+    assert type(embeds) is np.ndarray
 
 
 @pytest.mark.parametrize("target_metric", ["categorical", "euclidean"])
diff --git a/python/cuml/pyproject.toml b/python/cuml/pyproject.toml
index ad8d16e076..8934a0f226 100644
--- a/python/cuml/pyproject.toml
+++ b/python/cuml/pyproject.toml
@@ -78,7 +78,7 @@ authors = [
     { name = "NVIDIA Corporation" },
 ]
 license = { text = "Apache 2.0" }
-requires-python = ">=3.9"
+requires-python = ">=3.10"
 dependencies = [
     "cudf==24.10.*,>=0.0.0a0",
     "cupy-cuda11x>=12.0.0",
@@ -86,6 +86,7 @@ dependencies = [
     "dask-cudf==24.10.*,>=0.0.0a0",
     "joblib>=0.11",
     "numba>=0.57",
+    "numpy>=1.23,<3.0a0",
     "nvidia-cublas",
     "nvidia-cufft",
     "nvidia-curand",
@@ -102,9 +103,9 @@ dependencies = [
 classifiers = [
     "Intended Audience :: Developers",
     "Programming Language :: Python",
-    "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
 ]
 
 [project.optional-dependencies]
@@ -116,7 +117,7 @@ test = [
     "hypothesis>=6.0,<7",
     "nltk",
     "numpydoc",
-    "pynndescent==0.5.8",
+    "pynndescent",
     "pytest-benchmark",
     "pytest-cases",
     "pytest-cov",
@@ -124,8 +125,9 @@ test = [
     "pytest==7.*",
     "scikit-learn==1.5",
     "seaborn",
+    "setuptools",
     "statsmodels",
-    "umap-learn==0.5.3",
+    "umap-learn==0.5.6",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 
 [project.urls]
 Documentation = "https://docs.rapids.ai/api/cuml/stable/"
 
 [tool.black]
 line-length = 79
-target-version = ["py39"]
+target-version = ["py310"]
 include = '\.py?$'
 force-exclude = '''
   _stop_words\.py |