diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index db8393edeb..af3a4c124b 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -18,7 +18,34 @@ rapids-generate-version > ./VERSION
 
 cd ${package_dir}
 
-SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DDISABLE_DEPRECATION_WARNINGS=ON;-DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/" \
+# Pick the auditwheel exclusions and extra CMake flags for this CUDA version.
+# EXCLUDE_ARGS is passed to "auditwheel repair"; EXTRA_CMAKE_ARGS is appended
+# to SKBUILD_CMAKE_ARGS.
+case "${RAPIDS_CUDA_VERSION}" in
+  12.*)
+    EXCLUDE_ARGS=(
+      --exclude "libcublas.so.12"
+      --exclude "libcublasLt.so.12"
+      --exclude "libcufft.so.11"
+      --exclude "libcurand.so.10"
+      --exclude "libcusolver.so.11"
+      --exclude "libcusparse.so.12"
+      --exclude "libnvJitLink.so.12"
+    )
+    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
+    ;;
+  11.*)
+    EXCLUDE_ARGS=()
+    EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF"
+    ;;
+  *)
+    # Fail loudly rather than leaving EXCLUDE_ARGS/EXTRA_CMAKE_ARGS unset.
+    printf 'Unsupported RAPIDS_CUDA_VERSION: %s\n' "${RAPIDS_CUDA_VERSION:-<unset>}" >&2
+    exit 1
+    ;;
+esac
+
+SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DDISABLE_DEPRECATION_WARNINGS=ON;-DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/${EXTRA_CMAKE_ARGS}" \
   python -m pip wheel . \
     -w dist \
     -vvv \
@@ -26,6 +53,6 @@ SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DDISABLE_DEPRECATION_WARNINGS=ON;-DC
     --disable-pip-version-check
 
 mkdir -p final_dist
-python -m auditwheel repair -w final_dist dist/*
+python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*
 
 RAPIDS_PY_WHEEL_NAME="cuml_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 52ed710f34..c953ad0317 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -18,7 +18,7 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
 CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
 CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}
 
-#Get <major>.<minor> for next version
+# Get <major>.<minor> for next version
 NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
 NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
 NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
@@ -58,7 +58,7 @@ for DEP in "${DEPENDENCIES[@]}"; do
     sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
   done
   for FILE in python/*/pyproject.toml; do
-    sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
+    sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
   done
 done
 
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index e17902c787..e8a39e250e 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -22,7 +22,7 @@ dependencies:
 - doxygen=1.9.1
 - gcc_linux-64=11.*
 - graphviz
-- hdbscan<=0.8.30
+- hdbscan>=0.8.38,<0.8.39
 - hypothesis>=6.0,<7
 - ipykernel
 - ipython
diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml
index b4d3324501..293028cdb1 100644
--- a/conda/environments/all_cuda-125_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - doxygen=1.9.1
 - gcc_linux-64=11.*
 - graphviz
-- hdbscan<=0.8.30
+- hdbscan>=0.8.38,<0.8.39
 - hypothesis>=6.0,<7
 - ipykernel
 - ipython
diff --git a/conda/recipes/cuml-cpu/meta.yaml b/conda/recipes/cuml-cpu/meta.yaml
index e3813db79a..09686ff9dd 100644
--- a/conda/recipes/cuml-cpu/meta.yaml
+++ b/conda/recipes/cuml-cpu/meta.yaml
@@ -34,7 +34,7 @@ requirements:
     - numpy>=1.23,<2.0a0
     - pandas
     - scikit-learn=1.2
-    - hdbscan<=0.8.30
+    - hdbscan>=0.8.38,<0.8.39
     - umap-learn=0.5.3
     - nvtx
 
diff --git a/cpp/src/fil/common.cuh b/cpp/src/fil/common.cuh
index e62df3e21f..fe5ba32496 100644
--- a/cpp/src/fil/common.cuh
+++ b/cpp/src/fil/common.cuh
@@ -309,17 +309,12 @@ struct compute_smem_footprint : dispatch_functor<int> {
   int run(predict_params);
 };
 
-template <int NITEMS,
-          leaf_algo_t leaf_algo,
-          bool cols_in_shmem,
-          bool CATS_SUPPORTED,
-          class storage_type>
-__attribute__((visibility("hidden"))) __global__ void infer_k(storage_type forest,
-                                                              predict_params params);
-
 // infer() calls the inference kernel with the parameters on the stream
 template <typename storage_type>
 void infer(storage_type forest, predict_params params, cudaStream_t stream);
 
+template <typename storage_type>
+void infer_shared_mem_size(predict_params params, int max_shm);
+
 }  // namespace fil
 }  // namespace ML
diff --git a/cpp/src/fil/fil.cu b/cpp/src/fil/fil.cu
index e0d2f8baaf..69b0320e1e 100644
--- a/cpp/src/fil/fil.cu
+++ b/cpp/src/fil/fil.cu
@@ -349,26 +349,6 @@ struct forest {
   cat_sets_device_owner cat_sets_;
 };
 
-template <typename storage_type>
-struct opt_into_arch_dependent_shmem : dispatch_functor<void> {
-  const int max_shm;
-  opt_into_arch_dependent_shmem(int max_shm_) : max_shm(max_shm_) {}
-
-  template <typename KernelParams = KernelTemplateParams<>>
-  void run(predict_params p)
-  {
-    auto kernel = infer_k<KernelParams::N_ITEMS,
-                          KernelParams::LEAF_ALGO,
-                          KernelParams::COLS_IN_SHMEM,
-                          KernelParams::CATS_SUPPORTED,
-                          storage_type>;
-    // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because
-    // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick.
-    RAFT_CUDA_TRY(
-      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm));
-  }
-};
-
 template <typename real_t>
 struct dense_forest<dense_node<real_t>> : forest<real_t> {
   using node_t = dense_node<real_t>;
@@ -427,8 +407,9 @@ struct dense_forest<dense_node<real_t>> : forest<real_t> {
                                   h.get_stream()));
 
     // predict_proba is a runtime parameter, and opt-in is unconditional
-    dispatch_on_fil_template_params(opt_into_arch_dependent_shmem<storage<node_t>>(this->max_shm_),
-                                    static_cast<predict_params>(this->class_ssp_));
+    fil::infer_shared_mem_size<storage<node_t>>(static_cast<predict_params>(this->class_ssp_),
+                                                this->max_shm_);
+
     // copy must be finished before freeing the host data
     h.sync_stream();
     h_nodes_.clear();
@@ -491,8 +472,8 @@ struct sparse_forest : forest<typename node_t::real_type> {
       nodes_.data(), nodes, sizeof(node_t) * num_nodes_, cudaMemcpyHostToDevice, h.get_stream()));
 
     // predict_proba is a runtime parameter, and opt-in is unconditional
-    dispatch_on_fil_template_params(opt_into_arch_dependent_shmem<storage<node_t>>(this->max_shm_),
-                                    static_cast<predict_params>(this->class_ssp_));
+    fil::infer_shared_mem_size<storage<node_t>>(static_cast<predict_params>(this->class_ssp_),
+                                                this->max_shm_);
   }
 
   virtual void infer(predict_params params, cudaStream_t stream) override
diff --git a/cpp/src/fil/infer.cu b/cpp/src/fil/infer.cu
index 574a0a37e3..c3bdd1b810 100644
--- a/cpp/src/fil/infer.cu
+++ b/cpp/src/fil/infer.cu
@@ -908,12 +908,38 @@ struct infer_k_storage_template : dispatch_functor<void> {
   }
 };
 
+template <typename storage_type>
+struct opt_into_arch_dependent_shmem : dispatch_functor<void> {
+  const int max_shm;
+  opt_into_arch_dependent_shmem(int max_shm_) : max_shm(max_shm_) {}
+
+  template <typename KernelParams = KernelTemplateParams<>>
+  void run(predict_params p)
+  {
+    auto kernel = infer_k<KernelParams::N_ITEMS,
+                          KernelParams::LEAF_ALGO,
+                          KernelParams::COLS_IN_SHMEM,
+                          KernelParams::CATS_SUPPORTED,
+                          storage_type>;
+    // p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because
+    // we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick.
+    RAFT_CUDA_TRY(
+      cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm));
+  }
+};
+
 template <typename storage_type>
 void infer(storage_type forest, predict_params params, cudaStream_t stream)
 {
   dispatch_on_fil_template_params(infer_k_storage_template<storage_type>(forest, stream), params);
 }
 
+template <typename storage_type>
+void infer_shared_mem_size(predict_params params, int max_shm)
+{
+  dispatch_on_fil_template_params(opt_into_arch_dependent_shmem<storage_type>(max_shm), params);
+}
+
 template void infer<dense_storage_f32>(dense_storage_f32 forest,
                                        predict_params params,
                                        cudaStream_t stream);
@@ -930,5 +956,11 @@ template void infer<sparse_storage8>(sparse_storage8 forest,
                                      predict_params params,
                                      cudaStream_t stream);
 
+template void infer_shared_mem_size<dense_storage_f32>(predict_params params, int max_shm);
+template void infer_shared_mem_size<dense_storage_f64>(predict_params params, int max_shm);
+template void infer_shared_mem_size<sparse_storage16_f32>(predict_params params, int max_shm);
+template void infer_shared_mem_size<sparse_storage16_f64>(predict_params params, int max_shm);
+template void infer_shared_mem_size<sparse_storage8>(predict_params params, int max_shm);
+
 }  // namespace fil
 }  // namespace ML
diff --git a/dependencies.yaml b/dependencies.yaml
index 2146264618..e22aec7034 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -88,6 +88,7 @@ files:
     extras:
       table: project
     includes:
+      - cuda
       - py_run
   py_test:
     output: pyproject
@@ -176,7 +177,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - cython>=3.0.0
+          - &cython cython>=3.0.0
           - &treelite treelite==4.3.0
       - output_types: conda
         packages:
@@ -406,6 +407,26 @@ dependencies:
               - *libcusolver114
               - *libcusparse_dev114
               - *libcusparse114
+      - output_types: pyproject
+        matrices:
+          - matrix:
+              cuda: "12.*"
+            packages:
+              - nvidia-cublas-cu12
+              - nvidia-cufft-cu12
+              - nvidia-curand-cu12
+              - nvidia-cusparse-cu12
+              - nvidia-cusolver-cu12
+          - matrix:
+              cuda: "11.*"
+            packages:
+          - matrix:
+            packages:
+              - nvidia-cublas
+              - nvidia-cufft
+              - nvidia-curand
+              - nvidia-cusparse
+              - nvidia-cusolver
   docs:
     common:
       - output_types: [conda, requirements]
@@ -465,7 +486,9 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
+          - *cython
           - dask-ml
+          - hdbscan>=0.8.38,<0.8.39
           - hypothesis>=6.0,<7
           - nltk
           - numpydoc
@@ -484,14 +507,9 @@ dependencies:
           - pip
           - pip:
               - dask-glm==0.3.0
-          # TODO: remove pin once a release that includes fixes for the error
-          # is released: https://github.com/rapidsai/cuml/issues/5514
-          - hdbscan<=0.8.30
       - output_types: pyproject
         packages:
           - dask-glm==0.3.0
-            # TODO: Can we stop pulling from the master branch now that there was a release in October?
-          - hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master
   test_notebooks:
     common:
       - output_types: [conda, requirements]
diff --git a/python/cuml/CMakeLists.txt b/python/cuml/CMakeLists.txt
index f2541f7f04..224525ee58 100644
--- a/python/cuml/CMakeLists.txt
+++ b/python/cuml/CMakeLists.txt
@@ -38,6 +38,7 @@ project(
 option(CUML_UNIVERSAL "Build all cuML Python components." ON)
 option(FIND_CUML_CPP "Search for existing CUML C++ installations before defaulting to local files" OFF)
 option(SINGLEGPU "Disable all mnmg components and comms libraries" OFF)
+option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
 set(CUML_RAFT_CLONE_ON_PIN OFF)
 
 
@@ -72,8 +73,10 @@ include(rapids-cython-core)
 
 set(CUML_PYTHON_TREELITE_TARGET treelite::treelite)
 
-if(NOT ${CUML_CPU})
+if(NOT CUML_CPU)
   if(NOT cuml_FOUND)
+    find_package(CUDAToolkit REQUIRED)
+
     set(BUILD_CUML_TESTS OFF)
     set(BUILD_PRIMS_TESTS OFF)
     set(BUILD_CUML_C_LIBRARY OFF)
@@ -85,11 +88,19 @@ if(NOT ${CUML_CPU})
 
     # Statically link dependencies if building wheels
     set(CUDA_STATIC_RUNTIME ON)
-    set(CUDA_STATIC_MATH_LIBRARIES ON)
     set(CUML_USE_RAFT_STATIC ON)
     set(CUML_USE_FAISS_STATIC ON)
     set(CUML_USE_TREELITE_STATIC ON)
     set(CUML_USE_CUMLPRIMS_MG_STATIC ON)
+    # Link to the CUDA wheels with shared libraries for CUDA 12+
+    if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
+      set(CUDA_STATIC_MATH_LIBRARIES OFF)
+    else()
+      if(USE_CUDA_MATH_WHEELS)
+        message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
+      endif()
+      set(CUDA_STATIC_MATH_LIBRARIES ON)
+    endif()
     # Don't install the static libs into wheels
     set(CUML_EXCLUDE_RAFT_FROM_ALL ON)
     set(RAFT_EXCLUDE_FAISS_FROM_ALL ON)
@@ -98,6 +109,17 @@ if(NOT ${CUML_CPU})
 
     add_subdirectory(${CUML_CPP_SRC} cuml-cpp EXCLUDE_FROM_ALL)
 
+    if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
+      set_property(TARGET ${CUML_CPP_TARGET} PROPERTY INSTALL_RPATH
+        "$ORIGIN/../nvidia/cublas/lib"
+        "$ORIGIN/../nvidia/cufft/lib"
+        "$ORIGIN/../nvidia/curand/lib"
+        "$ORIGIN/../nvidia/cusolver/lib"
+        "$ORIGIN/../nvidia/cusparse/lib"
+        "$ORIGIN/../nvidia/nvjitlink/lib"
+      )
+    endif()
+
     set(cython_lib_dir cuml)
     install(TARGETS ${CUML_CPP_TARGET} DESTINATION ${cython_lib_dir})
   endif()
diff --git a/python/cuml/cuml/cluster/dbscan.pyx b/python/cuml/cuml/cluster/dbscan.pyx
index b1a8dd5ae8..fff1eef3f9 100644
--- a/python/cuml/cuml/cluster/dbscan.pyx
+++ b/python/cuml/cuml/cluster/dbscan.pyx
@@ -22,7 +22,7 @@ from cuml.internals.safe_imports import gpu_only_import
 cp = gpu_only_import('cupy')
 
 from cuml.internals.array import CumlArray
-from cuml.internals.base import Base
+from cuml.internals.base import UniversalBase
 from cuml.common.doc_utils import generate_docstring
 from cuml.common.array_descriptor import CumlArrayDescriptor
 from cuml.internals.mixins import ClusterMixin
@@ -106,7 +106,7 @@ IF GPUBUILD == 1:
                       bool opg) except +
 
 
-class DBSCAN(Base,
+class DBSCAN(UniversalBase,
              ClusterMixin,
              CMajorInputTagMixin):
     """
@@ -222,8 +222,8 @@ class DBSCAN(Base,
     """
 
     _cpu_estimator_import_path = 'sklearn.cluster.DBSCAN'
-    labels_ = CumlArrayDescriptor()
-    core_sample_indices_ = CumlArrayDescriptor()
+    core_sample_indices_ = CumlArrayDescriptor(order="C")
+    labels_ = CumlArrayDescriptor(order="C")
 
     @device_interop_preparation
     def __init__(self, *,
@@ -268,7 +268,7 @@ class DBSCAN(Base,
                              "np.int32, np.int64}")
 
         IF GPUBUILD == 1:
-            X_m, n_rows, n_cols, self.dtype = \
+            X_m, n_rows, self.n_features_in_, self.dtype = \
                 input_to_cuml_array(
                     X,
                     order='C',
@@ -338,7 +338,7 @@ class DBSCAN(Base,
                     fit(handle_[0],
                         <float*>input_ptr,
                         <int> n_rows,
-                        <int> n_cols,
+                        <int> self.n_features_in_,
                         <float> self.eps,
                         <int> self.min_samples,
                         <DistanceType> metric,
@@ -353,7 +353,7 @@ class DBSCAN(Base,
                     fit(handle_[0],
                         <float*>input_ptr,
                         <int64_t> n_rows,
-                        <int64_t> n_cols,
+                        <int64_t> self.n_features_in_,
                         <float> self.eps,
                         <int> self.min_samples,
                         <DistanceType> metric,
@@ -370,7 +370,7 @@ class DBSCAN(Base,
                     fit(handle_[0],
                         <double*>input_ptr,
                         <int> n_rows,
-                        <int> n_cols,
+                        <int> self.n_features_in_,
                         <double> self.eps,
                         <int> self.min_samples,
                         <DistanceType> metric,
@@ -385,7 +385,7 @@ class DBSCAN(Base,
                     fit(handle_[0],
                         <double*>input_ptr,
                         <int64_t> n_rows,
-                        <int64_t> n_cols,
+                        <int64_t> self.n_features_in_,
                         <double> self.eps,
                         <int> self.min_samples,
                         <DistanceType> metric,
@@ -475,3 +475,6 @@ class DBSCAN(Base,
             "metric",
             "algorithm",
         ]
+
+    def get_attr_names(self):
+        return ["core_sample_indices_", "labels_", "n_features_in_"]
diff --git a/python/cuml/cuml/cluster/kmeans.pyx b/python/cuml/cuml/cluster/kmeans.pyx
index 760df6306b..e8ab51e4dd 100644
--- a/python/cuml/cuml/cluster/kmeans.pyx
+++ b/python/cuml/cuml/cluster/kmeans.pyx
@@ -38,7 +38,7 @@ IF GPUBUILD == 1:
 
 from cuml.internals.array import CumlArray
 from cuml.common.array_descriptor import CumlArrayDescriptor
-from cuml.internals.base import Base
+from cuml.internals.base import UniversalBase
 from cuml.common.doc_utils import generate_docstring
 from cuml.internals.mixins import ClusterMixin
 from cuml.internals.mixins import CMajorInputTagMixin
@@ -46,8 +46,10 @@ from cuml.common import input_to_cuml_array
 from cuml.internals.api_decorators import device_interop_preparation
 from cuml.internals.api_decorators import enable_device_interop
 
+from sklearn.utils._openmp_helpers import _openmp_effective_n_threads
 
-class KMeans(Base,
+
+class KMeans(UniversalBase,
              ClusterMixin,
              CMajorInputTagMixin):
 
@@ -188,8 +190,8 @@ class KMeans(Base,
     """
 
     _cpu_estimator_import_path = 'sklearn.cluster.KMeans'
-    labels_ = CumlArrayDescriptor()
-    cluster_centers_ = CumlArrayDescriptor()
+    labels_ = CumlArrayDescriptor(order='C')
+    cluster_centers_ = CumlArrayDescriptor(order='C')
 
     def _get_kmeans_params(self):
         IF GPUBUILD == 1:
@@ -232,6 +234,9 @@ class KMeans(Base,
         self.labels_ = None
         self.cluster_centers_ = None
 
+        # For sklearn interoperability
+        self._n_threads = _openmp_effective_n_threads()
+
         # cuPy does not allow comparing with string. See issue #2372
         init_str = init if isinstance(init, str) else None
 
@@ -258,7 +263,7 @@ class KMeans(Base,
 
             IF GPUBUILD == 1:
                 self._params_init = Array
-            self.cluster_centers_, _n_rows, self.n_cols, self.dtype = \
+            self.cluster_centers_, _n_rows, self.n_features_in_, self.dtype = \
                 input_to_cuml_array(
                     init, order='C',
                     convert_to_dtype=(np.float32 if convert_dtype
@@ -274,7 +279,7 @@ class KMeans(Base,
 
         """
         if self.init == 'preset':
-            check_cols = self.n_cols
+            check_cols = self.n_features_in_
             check_dtype = self.dtype
             target_dtype = self.dtype
         else:
@@ -282,7 +287,7 @@ class KMeans(Base,
             check_dtype = [np.float32, np.float64]
             target_dtype = np.float32
 
-        _X_m, _n_rows, self.n_cols, self.dtype = \
+        _X_m, _n_rows, self.n_features_in_, self.dtype = \
             input_to_cuml_array(X,
                                 order='C',
                                 check_cols=check_cols,
@@ -306,14 +311,14 @@ class KMeans(Base,
 
             cdef uintptr_t sample_weight_ptr = sample_weight_m.ptr
 
-            int_dtype = np.int32 if np.int64(_n_rows) * np.int64(self.n_cols) < 2**31-1 else np.int64
+            int_dtype = np.int32 if np.int64(_n_rows) * np.int64(self.n_features_in_) < 2**31-1 else np.int64
 
             self.labels_ = CumlArray.zeros(shape=_n_rows, dtype=int_dtype)
             cdef uintptr_t labels_ptr = self.labels_.ptr
 
             if (self.init in ['scalable-k-means++', 'k-means||', 'random']):
                 self.cluster_centers_ = \
-                    CumlArray.zeros(shape=(self.n_clusters, self.n_cols),
+                    CumlArray.zeros(shape=(self.n_clusters, self.n_features_in_),
                                     dtype=self.dtype, order='C')
 
             cdef uintptr_t cluster_centers_ptr = self.cluster_centers_.ptr
@@ -334,7 +339,7 @@ class KMeans(Base,
                         <KMeansParams> deref(params),
                         <const float*> input_ptr,
                         <int> _n_rows,
-                        <int> self.n_cols,
+                        <int> self.n_features_in_,
                         <const float *>sample_weight_ptr,
                         <float*> cluster_centers_ptr,
                         <int*> labels_ptr,
@@ -347,7 +352,7 @@ class KMeans(Base,
                         <KMeansParams> deref(params),
                         <const float*> input_ptr,
                         <int64_t> _n_rows,
-                        <int64_t> self.n_cols,
+                        <int64_t> self.n_features_in_,
                         <const float *>sample_weight_ptr,
                         <float*> cluster_centers_ptr,
                         <int64_t*> labels_ptr,
@@ -364,7 +369,7 @@ class KMeans(Base,
                         <KMeansParams> deref(params),
                         <const double*> input_ptr,
                         <int> _n_rows,
-                        <int> self.n_cols,
+                        <int> self.n_features_in_,
                         <const double *>sample_weight_ptr,
                         <double*> cluster_centers_ptr,
                         <int*> labels_ptr,
@@ -378,7 +383,7 @@ class KMeans(Base,
                          <KMeansParams> deref(params),
                          <const double*> input_ptr,
                          <int64_t> _n_rows,
-                         <int64_t> self.n_cols,
+                         <int64_t> self.n_features_in_,
                          <const double *>sample_weight_ptr,
                          <double*> cluster_centers_ptr,
                          <int64_t*> labels_ptr,
@@ -442,11 +447,13 @@ class KMeans(Base,
         Sum of squared distances of samples to their closest cluster center.
         """
 
+        self.dtype = self.cluster_centers_.dtype
+
         _X_m, _n_rows, _n_cols, _ = \
             input_to_cuml_array(X, order='C', check_dtype=self.dtype,
                                 convert_to_dtype=(self.dtype if convert_dtype
                                                   else None),
-                                check_cols=self.n_cols)
+                                check_cols=self.n_features_in_)
 
         IF GPUBUILD == 1:
             cdef uintptr_t input_ptr = _X_m.ptr
@@ -486,7 +493,7 @@ class KMeans(Base,
                         <float*> cluster_centers_ptr,
                         <float*> input_ptr,
                         <size_t> _n_rows,
-                        <size_t> self.n_cols,
+                        <size_t> self.n_features_in_,
                         <float *>sample_weight_ptr,
                         <bool> normalize_weights,
                         <int*> labels_ptr,
@@ -498,7 +505,7 @@ class KMeans(Base,
                         <float*> cluster_centers_ptr,
                         <float*> input_ptr,
                         <int64_t> _n_rows,
-                        <int64_t> self.n_cols,
+                        <int64_t> self.n_features_in_,
                         <float *>sample_weight_ptr,
                         <bool> normalize_weights,
                         <int64_t*> labels_ptr,
@@ -513,7 +520,7 @@ class KMeans(Base,
                         <double*> cluster_centers_ptr,
                         <double*> input_ptr,
                         <size_t> _n_rows,
-                        <size_t> self.n_cols,
+                        <size_t> self.n_features_in_,
                         <double *>sample_weight_ptr,
                         <bool> normalize_weights,
                         <int*> labels_ptr,
@@ -525,7 +532,7 @@ class KMeans(Base,
                         <double*> cluster_centers_ptr,
                         <double*> input_ptr,
                         <int64_t> _n_rows,
-                        <int64_t> self.n_cols,
+                        <int64_t> self.n_features_in_,
                         <double *>sample_weight_ptr,
                         <bool> normalize_weights,
                         <int64_t*> labels_ptr,
@@ -578,7 +585,7 @@ class KMeans(Base,
             input_to_cuml_array(X, order='C', check_dtype=self.dtype,
                                 convert_to_dtype=(self.dtype if convert_dtype
                                                   else None),
-                                check_cols=self.n_cols)
+                                check_cols=self.n_features_in_)
         IF GPUBUILD == 1:
             cdef uintptr_t input_ptr = _X_m.ptr
 
@@ -607,7 +614,7 @@ class KMeans(Base,
                         <float*> cluster_centers_ptr,
                         <float*> input_ptr,
                         <int> _n_rows,
-                        <int> self.n_cols,
+                        <int> self.n_features_in_,
                         <float*> preds_ptr)
                 else:
                     cpp_transform(
@@ -616,7 +623,7 @@ class KMeans(Base,
                         <float*> cluster_centers_ptr,
                         <float*> input_ptr,
                         <int64_t> _n_rows,
-                        <int64_t> self.n_cols,
+                        <int64_t> self.n_features_in_,
                         <float*> preds_ptr)
 
             elif self.dtype == np.float64:
@@ -627,7 +634,7 @@ class KMeans(Base,
                         <double*> cluster_centers_ptr,
                         <double*> input_ptr,
                         <int> _n_rows,
-                        <int> self.n_cols,
+                        <int> self.n_features_in_,
                         <double*> preds_ptr)
                 else:
                     cpp_transform(
@@ -636,7 +643,7 @@ class KMeans(Base,
                         <double*> cluster_centers_ptr,
                         <double*> input_ptr,
                         <int64_t> _n_rows,
-                        <int64_t> self.n_cols,
+                        <int64_t> self.n_features_in_,
                         <double*> preds_ptr)
 
             else:
@@ -685,3 +692,7 @@ class KMeans(Base,
             ['n_init', 'oversampling_factor', 'max_samples_per_batch',
                 'init', 'max_iter', 'n_clusters', 'random_state',
                 'tol', "convert_dtype"]
+
+    def get_attr_names(self):
+        return ['cluster_centers_', 'labels_', 'inertia_',
+                'n_iter_', 'n_features_in_', '_n_threads']
diff --git a/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py b/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
index a78d507c75..dc6b63428d 100644
--- a/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
+++ b/python/cuml/cuml/tests/dask/test_dask_logistic_regression.py
@@ -682,7 +682,7 @@ def test_exception_one_label(fit_intercept, client):
     y = np.array([1.0, 1.0, 1.0, 1.0], datatype)
     X_df, y_df = _prep_training_data(client, X, y, n_parts)
 
-    err_msg = "This solver needs samples of at least 2 classes in the data, but the data contains only one class: 1.0"
+    err_msg = "This solver needs samples of at least 2 classes in the data, but the data contains only one class:.*1.0"
 
     from cuml.dask.linear_model import LogisticRegression as cumlLBFGS_dask
 
diff --git a/python/cuml/cuml/tests/test_device_selection.py b/python/cuml/cuml/tests/test_device_selection.py
index e5c2d9ce1a..1da3b0738e 100644
--- a/python/cuml/cuml/tests/test_device_selection.py
+++ b/python/cuml/cuml/tests/test_device_selection.py
@@ -32,7 +32,10 @@
 from cuml.internals.memory_utils import using_memory_type
 from cuml.internals.mem_type import MemoryType
 from cuml.decomposition import PCA, TruncatedSVD
+from cuml.cluster import KMeans
+from cuml.cluster import DBSCAN
 from cuml.common.device_selection import DeviceType, using_device_type
+from cuml.testing.utils import assert_dbscan_equal
 from hdbscan import HDBSCAN as refHDBSCAN
 from sklearn.neighbors import NearestNeighbors as skNearestNeighbors
 from sklearn.linear_model import Ridge as skRidge
@@ -42,6 +45,8 @@
 from sklearn.linear_model import LinearRegression as skLinearRegression
 from sklearn.decomposition import PCA as skPCA
 from sklearn.decomposition import TruncatedSVD as skTruncatedSVD
+from sklearn.cluster import KMeans as skKMeans
+from sklearn.cluster import DBSCAN as skDBSCAN
 from sklearn.datasets import make_regression, make_blobs
 from pytest_cases import fixture_union, fixture
 from importlib import import_module
@@ -136,7 +141,11 @@ def make_reg_dataset():
 
 def make_blob_dataset():
     X, y = make_blobs(
-        n_samples=2000, n_features=20, centers=20, random_state=0
+        n_samples=2000,
+        n_features=20,
+        centers=20,
+        random_state=0,
+        cluster_std=1.0,
     )
     X_train, X_test = X[:1800], X[1800:]
     y_train, _ = y[:1800], y[1800:]
@@ -948,3 +957,48 @@ def test_hdbscan_methods(train_device, infer_device):
     assert_membership_vectors(membership, ref_membership)
     assert adjusted_rand_score(labels, ref_labels) >= 0.98
     assert array_equal(probs, ref_probs, unit_tol=0.001, total_tol=0.006)
+
+
+@pytest.mark.parametrize("train_device", ["cpu", "gpu"])
+@pytest.mark.parametrize("infer_device", ["cpu", "gpu"])
+def test_kmeans_methods(train_device, infer_device):
+    # Fit under train_device and predict under infer_device, then compare the
+    # predictions with scikit-learn's KMeans using the adjusted Rand score.
+    n_clusters = 20
+    ref_model = skKMeans(n_clusters=n_clusters)
+    ref_model.fit(X_train_blob)
+    ref_output = ref_model.predict(X_test_blob)
+
+    model = KMeans(n_clusters=n_clusters)
+    with using_device_type(train_device):
+        model.fit(X_train_blob)
+    with using_device_type(infer_device):
+        output = model.predict(X_test_blob)
+
+    assert adjusted_rand_score(ref_output, output) >= 0.9
+
+
+@pytest.mark.parametrize("train_device", ["cpu", "gpu"])
+@pytest.mark.parametrize("infer_device", ["cpu", "gpu"])
+def test_dbscan_methods(train_device, infer_device):
+    # The initial fit runs under train_device; fit_predict then runs under
+    # infer_device. Results are compared with scikit-learn's DBSCAN.
+    eps = 8.0
+    ref_model = skDBSCAN(eps=eps)
+    ref_model.fit(X_train_blob)
+    ref_output = ref_model.fit_predict(X_train_blob)
+
+    model = DBSCAN(eps=eps)
+    with using_device_type(train_device):
+        model.fit(X_train_blob)
+    with using_device_type(infer_device):
+        output = model.fit_predict(X_train_blob)
+
+    # Compare the reference against the model under test, not against itself.
+    assert array_equal(
+        ref_model.core_sample_indices_, model.core_sample_indices_
+    )
+    assert adjusted_rand_score(ref_output, output) >= 0.95
+    assert_dbscan_equal(
+        ref_output, output, X_train_blob, model.core_sample_indices_, eps
+    )
diff --git a/python/cuml/pyproject.toml b/python/cuml/pyproject.toml
index 2073a5060e..0b5c650703 100644
--- a/python/cuml/pyproject.toml
+++ b/python/cuml/pyproject.toml
@@ -86,6 +86,11 @@ dependencies = [
     "dask-cudf==24.10.*,>=0.0.0a0",
     "joblib>=0.11",
     "numba>=0.57",
+    "nvidia-cublas",
+    "nvidia-cufft",
+    "nvidia-curand",
+    "nvidia-cusolver",
+    "nvidia-cusparse",
     "packaging",
     "pylibraft==24.10.*,>=0.0.0a0",
     "raft-dask==24.10.*,>=0.0.0a0",
@@ -104,9 +109,10 @@ classifiers = [
 
 [project.optional-dependencies]
 test = [
+    "cython>=3.0.0",
     "dask-glm==0.3.0",
     "dask-ml",
-    "hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master",
+    "hdbscan>=0.8.38,<0.8.39",
     "hypothesis>=6.0,<7",
     "nltk",
     "numpydoc",