Make get_param_names a class method on single GPU estimators to match Scikit-learn closer (#6101)

A small difference between our estimators and Scikit-learn's is that `get_param_names` is a classmethod in sklearn but an instance method in ours. This can cause a few corner cases to fail when our estimators are used where Scikit-learn-like estimators are expected. This PR fixes that by making the method a classmethod and renaming it to `_get_param_names`. **Note:** This does not include dask-based estimators for the time being, since they depend on introspection at object creation time. Authors: - Dante Gama Dessavre (https://github.com/dantegd) - Divye Gala (https://github.com/divyegala) Approvers: - William Hicks (https://github.com/wphicks) URL: #6101
rapidsai · Nov 12, 2024 · 8e195fb · 8e195fb
1 parent 22c3ee8
commit 8e195fb
Show file tree

Hide file tree

Showing 54 changed files with 238 additions and 174 deletions.
diff --git a/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_data.py b/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_data.py
@@ -326,8 +326,9 @@ def _reset(self):
             del self.data_max_
             del self.data_range_
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "feature_range",
             "copy"
         ]
@@ -651,8 +652,9 @@ def _reset(self):
             del self.mean_
             del self.var_
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "with_mean",
             "with_std",
             "copy"
@@ -955,8 +957,9 @@ def _reset(self):
             del self.n_samples_seen_
             del self.max_abs_
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "copy"
         ]
 
@@ -1205,8 +1208,9 @@ def __init__(self, *, with_centering=True, with_scaling=True,
         self.quantile_range = quantile_range
         self.copy = copy
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "with_centering",
             "with_scaling",
             "quantile_range",
@@ -1478,8 +1482,9 @@ def __init__(self, degree=2, *, interaction_only=False, include_bias=True,
         self.include_bias = include_bias
         self.order = order
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "degree",
             "interaction_only",
             "include_bias",
@@ -2273,8 +2278,9 @@ def __init__(self, *, n_quantiles=1000, output_distribution='uniform',
         self.random_state = random_state
         self.copy = copy
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "n_quantiles",
             "output_distribution",
             "ignore_implicit_zeros",
@@ -2797,8 +2803,9 @@ def __init__(self, method='yeo-johnson', *, standardize=True, copy=True):
         self.standardize = standardize
         self.copy = copy
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "method",
             "standardize",
             "copy"

diff --git a/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_discretization.py b/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_discretization.py
@@ -158,8 +158,9 @@ def __init__(self, n_bins=5, *, encode='onehot', strategy='quantile'):
         self.encode = encode
         self.strategy = strategy
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "n_bins",
             "encode",
             "strategy"

diff --git a/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_imputation.py b/python/cuml/cuml/_thirdparty/sklearn/preprocessing/_imputation.py
@@ -243,8 +243,9 @@ def __init__(self, *, missing_values=np.nan, strategy="mean",
         self.fill_value = fill_value
         self.copy = copy
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "strategy",
             "fill_value",
             "verbose",
@@ -544,8 +545,9 @@ def __init__(self, *, missing_values=np.nan, features="missing-only",
         self.sparse = sparse
         self.error_on_new = error_on_new
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "missing_values",
             "features",
             "sparse",

diff --git a/python/cuml/cuml/cluster/agglomerative.pyx b/python/cuml/cuml/cluster/agglomerative.pyx
@@ -279,8 +279,9 @@ class AgglomerativeClustering(Base, ClusterMixin, CMajorInputTagMixin):
         """
         return self.fit(X).labels_
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "n_clusters",
             "affinity",
             "metric",

diff --git a/python/cuml/cuml/cluster/dbscan.pyx b/python/cuml/cuml/cluster/dbscan.pyx
@@ -466,8 +466,9 @@ class DBSCAN(UniversalBase,
         self.fit(X, out_dtype, sample_weight)
         return self.labels_
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "eps",
             "min_samples",
             "max_mbytes_per_batch",

diff --git a/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx b/python/cuml/cuml/cluster/hdbscan/hdbscan.pyx
@@ -1112,8 +1112,9 @@ class HDBSCAN(UniversalBase, ClusterMixin, CMajorInputTagMixin):
 
             self._cpu_to_gpu_interop_prepped = True
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "metric",
             "min_cluster_size",
             "max_cluster_size",

diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
@@ -696,8 +696,9 @@ class KMeans(UniversalBase,
         self.fit(X, sample_weight=sample_weight)
         return self.transform(X, convert_dtype=convert_dtype)
 
-    def get_param_names(self):
-        return super().get_param_names() + \
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + \
             ['n_init', 'oversampling_factor', 'max_samples_per_batch',
                 'init', 'max_iter', 'n_clusters', 'random_state',
                 'tol', "convert_dtype"]

diff --git a/python/cuml/cuml/dask/cluster/dbscan.py b/python/cuml/cuml/dask/cluster/dbscan.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
+# Copyright (c) 2020-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -160,5 +160,5 @@ def fit_predict(self, X, out_dtype="int32"):
         self.fit(X, out_dtype)
         return self.get_combined_model().labels_
 
-    def get_param_names(self):
+    def _get_param_names(self):
         return list(self.kwargs.keys())
diff --git a/python/cuml/cuml/dask/cluster/kmeans.py b/python/cuml/cuml/dask/cluster/kmeans.py
@@ -302,5 +302,5 @@ def score(self, X, sample_weight=None):
             cp.asarray(self.client.compute(scores, sync=True)) * -1.0
         )
 
-    def get_param_names(self):
+    def _get_param_names(self):
         return list(self.kwargs.keys())
diff --git a/python/cuml/cuml/dask/decomposition/pca.py b/python/cuml/cuml/dask/decomposition/pca.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -215,7 +215,7 @@ def inverse_transform(self, X, delayed=True):
         """
         return self._inverse_transform(X, n_dims=2, delayed=delayed)
 
-    def get_param_names(self):
+    def _get_param_names(self):
         return list(self.kwargs.keys())
 
     @staticmethod

diff --git a/python/cuml/cuml/dask/decomposition/tsvd.py b/python/cuml/cuml/dask/decomposition/tsvd.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -177,7 +177,7 @@ def inverse_transform(self, X, delayed=True):
         """
         return self._inverse_transform(X, n_dims=2, delayed=delayed)
 
-    def get_param_names(self):
+    def _get_param_names(self):
         return list(self.kwargs.keys())
 
     @staticmethod

diff --git a/python/cuml/cuml/dask/linear_model/linear_regression.py b/python/cuml/cuml/dask/linear_model/linear_regression.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -106,7 +106,7 @@ def predict(self, X, delayed=True):
         """
         return self._predict(X, delayed=delayed)
 
-    def get_param_names(self):
+    def _get_param_names(self):
         return list(self.kwargs.keys())
 
     @staticmethod

diff --git a/python/cuml/cuml/dask/linear_model/ridge.py b/python/cuml/cuml/dask/linear_model/ridge.py
@@ -1,4 +1,4 @@
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -114,7 +114,7 @@ def predict(self, X, delayed=True):
         """
         return self._predict(X, delayed=delayed)
 
-    def get_param_names(self):
+    def _get_param_names(self):
         return list(self.kwargs.keys())
 
     @staticmethod

diff --git a/python/cuml/cuml/decomposition/incremental_pca.py b/python/cuml/cuml/decomposition/incremental_pca.py
@@ -216,7 +216,6 @@ def __init__(
             output_type=output_type,
         )
         self.batch_size = batch_size
-        self._hyperparams = ["n_components", "whiten", "copy", "batch_size"]
         self._sparse_model = True
 
     def fit(self, X, y=None, convert_dtype=True) -> "IncrementalPCA":
@@ -449,9 +448,15 @@ def transform(self, X, convert_dtype=False) -> CumlArray:
         else:
             return super().transform(X)
 
-    def get_param_names(self):
+    @classmethod
+    def _get_param_names(cls):
         # Skip super() since we dont pass any extra parameters in __init__
-        return Base.get_param_names(self) + self._hyperparams
+        return Base._get_param_names() + [
+            "n_components",
+            "whiten",
+            "copy",
+            "batch_size",
+        ]
 
 
 def _validate_sparse_input(X):

diff --git a/python/cuml/cuml/decomposition/pca.pyx b/python/cuml/cuml/decomposition/pca.pyx
@@ -725,8 +725,9 @@ class PCA(UniversalBase,
 
             return t_input_data
 
-    def get_param_names(self):
-        return super().get_param_names() + \
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + \
             ["copy", "iterated_power", "n_components", "svd_solver", "tol",
                 "whiten", "random_state"]
 

diff --git a/python/cuml/cuml/decomposition/tsvd.pyx b/python/cuml/cuml/decomposition/tsvd.pyx
@@ -482,8 +482,9 @@ class TruncatedSVD(UniversalBase,
 
             return t_input_data
 
-    def get_param_names(self):
-        return super().get_param_names() + \
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + \
             ["algorithm", "n_components", "n_iter", "random_state", "tol"]
 
     def get_attr_names(self):

diff --git a/python/cuml/cuml/ensemble/randomforest_common.pyx b/python/cuml/cuml/ensemble/randomforest_common.pyx
@@ -382,8 +382,9 @@ class BaseRandomForestModel(Base):
             preds = tl_to_fil_model.predict(X)
         return preds
 
-    def get_param_names(self):
-        return super().get_param_names() + BaseRandomForestModel._param_names
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + BaseRandomForestModel._param_names
 
     def set_params(self, **params):
         self.treelite_serialized_model = None

diff --git a/python/cuml/cuml/experimental/linear_model/lars.pyx b/python/cuml/cuml/experimental/linear_model/lars.pyx
@@ -397,7 +397,8 @@ class Lars(Base, RegressorMixin):
 
         return preds
 
-    def get_param_names(self):
-        return super().get_param_names() + \
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + \
             ['copy_X', 'fit_intercept', 'fit_path', 'n_nonzero_coefs',
              'normalize', 'precompute', 'eps']
diff --git a/python/cuml/cuml/feature_extraction/_tfidf.py b/python/cuml/cuml/feature_extraction/_tfidf.py
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -301,8 +301,9 @@ def idf_(self, value):
             (value, 0), shape=(n_features, n_features), dtype=cp.float32
         )
 
-    def get_param_names(self):
-        return super().get_param_names() + [
+    @classmethod
+    def _get_param_names(cls):
+        return super()._get_param_names() + [
             "norm",
             "use_idf",
             "smooth_idf",

diff --git a/python/cuml/cuml/internals/base.pyx b/python/cuml/cuml/internals/base.pyx
@@ -1,5 +1,5 @@
 #
-# Copyright (c) 2019-2023, NVIDIA CORPORATION.
+# Copyright (c) 2019-2024, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -182,7 +182,8 @@ class Base(TagsMixin,
                 self._check_output_type(data)
                 # inference logic goes here
 
-            def get_param_names(self):
+            @classmethod
+            def _get_param_names(cls):
                 # return a list of hyperparam names supported by this algo
 
         # stream and handle example:
@@ -270,7 +271,8 @@ class Base(TagsMixin,
             output += ' <sk_model_ attribute used>'
         return output
 
-    def get_param_names(self):
+    @classmethod
+    def _get_param_names(cls):
         """
         Returns a list of hyperparameter names owned by this class. It is
         expected that every child class overrides this method and appends its
@@ -282,12 +284,12 @@ class Base(TagsMixin,
     def get_params(self, deep=True):
         """
         Returns a dict of all params owned by this class. If the child class
-        has appropriately overridden the `get_param_names` method and does not
+        has appropriately overridden the `_get_param_names` method and does not
         need anything other than what is there in this method, then it doesn't
         have to override this method
         """
         params = dict()
-        variables = self.get_param_names()
+        variables = self._get_param_names()
         for key in variables:
             var_value = getattr(self, key, None)
             params[key] = var_value
@@ -297,12 +299,12 @@ class Base(TagsMixin,
         """
         Accepts a dict of params and updates the corresponding ones owned by
         this class. If the child class has appropriately overridden the
-        `get_param_names` method and does not need anything other than what is,
+        `_get_param_names` method and does not need anything other than what is,
         there in this method, then it doesn't have to override this method
         """
         if not params:
             return self
-        variables = self.get_param_names()
+        variables = self._get_param_names()
         for key, value in params.items():
             if key not in variables:
                 raise ValueError("Bad param '%s' passed to set_params" % key)