Fix UMAP issues with large inputs #6245

Open · wants to merge 24 commits into base: branch-25.04
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -76,7 +76,7 @@ dependencies:
   - sphinx-markdown-tables
   - statsmodels
   - sysroot_linux-64==2.28
-  - treelite==4.4.1
+  - treelite==4.3.0
   - umap-learn==0.5.6
   - xgboost>=2.1.0
 name: all_cuda-118_arch-x86_64
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -72,7 +72,7 @@ dependencies:
   - sphinx-markdown-tables
   - statsmodels
   - sysroot_linux-64==2.28
-  - treelite==4.4.1
+  - treelite==4.3.0
   - umap-learn==0.5.6
   - xgboost>=2.1.0
 name: all_cuda-128_arch-x86_64
2 changes: 1 addition & 1 deletion conda/recipes/cuml/conda_build_config.yaml
@@ -20,4 +20,4 @@ c_stdlib_version:
   - "=2.28"

 treelite_version:
-  - "=4.4.1"
+  - "=4.3.0"
2 changes: 1 addition & 1 deletion conda/recipes/libcuml/conda_build_config.yaml
@@ -26,7 +26,7 @@ spdlog_version:
   - ">=1.14.1,<1.15"

 treelite_version:
-  - "=4.4.1"
+  - "=4.3.0"

 # The CTK libraries below are missing from the conda-forge::cudatoolkit package
 # for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_cuvs.cmake
@@ -73,8 +73,8 @@ endfunction()
 # To use a different CUVS locally, set the CMake variable
 # CPM_cuvs_SOURCE=/path/to/local/cuvs
 find_and_configure_cuvs(VERSION ${CUML_MIN_VERSION_cuvs}
-                        FORK rapidsai
-                        PINNED_TAG branch-${CUML_BRANCH_VERSION_cuvs}
+                        FORK divyegala
+                        PINNED_TAG raft-sparse-updates
                         EXCLUDE_FROM_ALL ${CUML_EXCLUDE_CUVS_FROM_ALL}
                         # When PINNED_TAG above doesn't match cuml,
                         # force local cuvs clone in build directory

Contributor, on FORK divyegala: Just noting that we'll need to make sure to switch these back once the associated PRs have been merged so we don't forget.

Contributor, on lines +76 to +77: Reminder to revert this before merge.
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_raft.cmake
@@ -72,8 +72,8 @@ endfunction()
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft}
-                        FORK rapidsai
-                        PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
+                        FORK viclafargue
+                        PINNED_TAG fix-sparse-utilities
                         EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL}
                         # When PINNED_TAG above doesn't match cuml,
                         # force local raft clone in build directory

Contributor, on lines +75 to +76: Reminder to revert this before merge.
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_treelite.cmake
@@ -79,7 +79,7 @@ function(find_and_configure_treelite)
   rapids_export_find_package_root(BUILD Treelite [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cuml-exports)
 endfunction()

-find_and_configure_treelite(VERSION 4.4.1
-                            PINNED_TAG 386bd0de99f5a66584c7e58221ee38ce606ad1ae
+find_and_configure_treelite(VERSION 4.3.0
+                            PINNED_TAG 575e4208f2b18e40d818c338ecb95d7a26e69aab
                             EXCLUDE_FROM_ALL ${CUML_EXCLUDE_TREELITE_FROM_ALL}
                             BUILD_STATIC_LIBS ${CUML_USE_TREELITE_STATIC})

Contributor, on lines +82 to +83: What is the motivation for the downgrade?
8 changes: 5 additions & 3 deletions cpp/include/cuml/manifold/common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@

 #pragma once

+#include <stdint.h>
+
 namespace ML {

 // Dense input uses int64_t until FAISS is updated
@@ -55,8 +57,8 @@ struct knn_graph {
 template <typename T>
 struct manifold_inputs_t {
   T* y;
-  int n;
-  int d;
+  uint64_t n;
+  uint64_t d;

   manifold_inputs_t(T* y_, int n_, int d_) : y(y_), n(n_), d(d_) {}
3 changes: 2 additions & 1 deletion cpp/src/tsne/tsne_runner.cuh
@@ -37,6 +37,7 @@

 #include <cuvs/distance/distance.hpp>
 #include <pca/pca.cuh>
+#include <stdint.h>

 namespace ML {

@@ -167,7 +168,7 @@ class TSNE_runner {
   {
     distance_and_perplexity();

-    const auto NNZ  = COO_Matrix.nnz;
+    const auto NNZ  = (value_idx)COO_Matrix.nnz;
     auto* VAL       = COO_Matrix.vals();
     const auto* COL = COO_Matrix.cols();
     const auto* ROW = COO_Matrix.rows();

Contributor, suggesting value_idx(COO_Matrix.nnz) in place of the C-style (value_idx)COO_Matrix.nnz: Let's not let it prevent merge, but if we need to make further changes, it would be good to avoid raw casts.
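For readers outside the review thread, the distinction is easy to demo in isolation. A standalone sketch (value_idx is narrowed to int32_t here purely for illustration): all three spellings below perform the same conversion, but static_cast is the easiest to grep for and the most constrained, which is what makes raw casts worth avoiding in shared code.

#include <cstdint>

using value_idx = int32_t;  // stand-in for the template parameter in tsne_runner.cuh

int main()
{
  uint64_t nnz = 1000;  // plays the role of COO_Matrix.nnz

  value_idx a = (value_idx)nnz;               // C-style cast, as in the diff
  value_idx b = value_idx(nnz);               // functional cast, as suggested
  value_idx c = static_cast<value_idx>(nnz);  // explicit cast: searchable, refuses pointer puns
  return (a == b && b == c) ? 0 : 1;
}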
32 changes: 18 additions & 14 deletions cpp/src/umap/fuzzy_simpl_set/naive.cuh
@@ -30,6 +30,7 @@

 #include <cuda_runtime.h>

+#include <stdint.h>
 #include <stdio.h>

 #include <string>
@@ -92,7 +93,8 @@ CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists,
 {
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  int i   = row * n_neighbors;  // each thread processes one row of the dist matrix
+  uint64_t i =
+    static_cast<uint64_t>(row) * n_neighbors;  // each thread processes one row of the dist matrix

   if (row < n) {
     float target = __log2f(n_neighbors) * bandwidth;
@@ -190,7 +192,7 @@ CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists,
  *
  * Descriptions adapted from: https://github.com/lmcinnes/umap/blob/master/umap/umap_.py
  */
-template <int TPB_X, typename value_idx, typename value_t>
+template <uint64_t TPB_X, typename value_idx, typename value_t>
 CUML_KERNEL void compute_membership_strength_kernel(
   const value_idx* knn_indices,
   const float* knn_dists,  // nn outputs
@@ -199,14 +201,14 @@ CUML_KERNEL void compute_membership_strength_kernel(
   value_t* vals,
   int* rows,
   int* cols,  // result coo
-  int n,
-  int n_neighbors)
+  int n_neighbors,
+  uint64_t to_process)
 {  // model params

   // row-based matrix is best
-  int idx = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t idx = (blockIdx.x * TPB_X) + threadIdx.x;

-  if (idx < n * n_neighbors) {
+  if (idx < to_process) {
     int row = idx / n_neighbors;  // one neighbor per thread

     double cur_rho = rhos[row];
@@ -237,8 +239,8 @@
 /*
  * Sets up and runs the knn dist smoothing
  */
-template <int TPB_X, typename value_idx, typename value_t>
-void smooth_knn_dist(int n,
+template <uint64_t TPB_X, typename value_idx, typename value_t>
+void smooth_knn_dist(uint64_t n,
                      const value_idx* knn_indices,
                      const float* knn_dists,
                      value_t* rhos,
@@ -253,7 +255,8 @@

   rmm::device_uvector<value_t> dist_means_dev(n_neighbors, stream);

-  raft::stats::mean(dist_means_dev.data(), knn_dists, 1, n_neighbors * n, false, false, stream);
+  raft::stats::mean(
+    dist_means_dev.data(), knn_dists, uint64_t{1}, n * n_neighbors, false, false, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());

   value_t mean_dist = 0.0;
@@ -284,8 +287,8 @@
  * @param params UMAPParams config object
  * @param stream cuda stream to use for device operations
  */
-template <int TPB_X, typename value_idx, typename value_t>
-void launcher(int n,
+template <uint64_t TPB_X, typename value_idx, typename value_t>
+void launcher(uint64_t n,
               const value_idx* knn_indices,
               const value_t* knn_dists,
               int n_neighbors,
@@ -328,7 +331,8 @@
   /*
    * Compute graph of membership strengths
    */

-  dim3 grid_elm(raft::ceildiv(n * n_neighbors, TPB_X), 1, 1);
+  uint64_t to_process = static_cast<uint64_t>(in.n_rows) * n_neighbors;
+  dim3 grid_elm(raft::ceildiv(to_process, TPB_X), 1, 1);
   dim3 blk_elm(TPB_X, 1, 1);

   compute_membership_strength_kernel<TPB_X><<<grid_elm, blk_elm, 0, stream>>>(knn_indices,
@@ -338,8 +342,8 @@
                                                                               in.vals(),
                                                                               in.rows(),
                                                                               in.cols(),
-                                                                              in.n_rows,
-                                                                              n_neighbors);
+                                                                              n_neighbors,
+                                                                              to_process);
   RAFT_CUDA_TRY(cudaPeekAtLastError());

   if (ML::default_logger().should_log(ML::level_enum::debug)) {

Contributor, on the grid_elm computation: I believe there's a chance for an overflow here. If to_process is large enough, we'd get something that would overflow a 32-bit integer, right? We also probably want to limit this to the maximum number of blocks (65535) anyway.
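One conventional way to address both concerns in the comment above is a grid-stride loop; the sketch below uses a hypothetical stub kernel and launcher (strength_stub, launch_capped, and the 256-thread block size are illustrative, not from the PR). The grid size is capped, and each thread strides across the full 64-bit index space, so neither a huge to_process nor the block limit is a problem. Note that 65535 is the limit for the grid's y/z dimensions; x allows up to 2^31 - 1 on current GPUs, but a conservative cap costs nothing once the kernel strides.

#include <algorithm>
#include <cstdint>

#include <cuda_runtime.h>

// Stand-in for compute_membership_strength_kernel: each thread strides over
// the 64-bit index space, so the launch needs only a bounded number of blocks
// no matter how large to_process gets.
__global__ void strength_stub(const float* knn_dists, float* vals, uint64_t to_process)
{
  const uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
  for (uint64_t idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < to_process;
       idx += stride) {
    vals[idx] = knn_dists[idx];  // placeholder for the real membership computation
  }
}

void launch_capped(const float* knn_dists, float* vals, uint64_t to_process, cudaStream_t stream)
{
  constexpr uint32_t TPB_X = 256;
  constexpr uint64_t cap   = 65535;  // conservative block cap per the review comment
  const uint64_t wanted    = (to_process + TPB_X - 1) / TPB_X;
  const uint32_t blocks    = static_cast<uint32_t>(std::min(wanted, cap));
  strength_stub<<<blocks, TPB_X, 0, stream>>>(knn_dists, vals, to_process);
}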
6 changes: 4 additions & 2 deletions cpp/src/umap/init_embed/random_algo.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,14 +20,16 @@

 #include <raft/random/rng.cuh>

+#include <stdint.h>
+
 namespace UMAPAlgo {
 namespace InitEmbed {
 namespace RandomInit {

 using namespace ML;

 template <typename T>
-void launcher(int n, int d, UMAPParams* params, T* embedding, cudaStream_t stream)
+void launcher(uint64_t n, int d, UMAPParams* params, T* embedding, cudaStream_t stream)
 {
   uint64_t seed = params->random_state;
9 changes: 6 additions & 3 deletions cpp/src/umap/init_embed/spectral_algo.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
 #include <thrust/execution_policy.h>
 #include <thrust/extrema.h>

+#include <stdint.h>
+
 #include <iostream>

 namespace UMAPAlgo {
@@ -44,15 +46,16 @@ using namespace ML;
  */
 template <typename T>
 void launcher(const raft::handle_t& handle,
-              int n,
+              uint64_t n,
               int d,
              raft::sparse::COO<float>* coo,
               UMAPParams* params,
               T* embedding)
 {
   cudaStream_t stream = handle.get_stream();

-  ASSERT(n > params->n_components, "Spectral layout requires n_samples > n_components");
+  ASSERT(n > static_cast<uint64_t>(params->n_components),
+         "Spectral layout requires n_samples > n_components");

   rmm::device_uvector<T> tmp_storage(n * params->n_components, stream);

Member, on the new uint64_t n parameter: Use idx_t.
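A minimal sketch of that suggestion (the alias name, width, and namespace placement are assumptions, not from this PR): define the index type once and write signatures against it, so a future change of width is a one-line edit rather than another sweep like this one.

#include <cstdint>
#include <cstdio>

namespace ML {
using idx_t = uint64_t;  // hypothetical project-wide index alias
}

// The launcher would then take ML::idx_t n instead of spelling uint64_t inline.
void launcher(ML::idx_t n, int d)
{
  std::printf("n = %llu, d = %d\n", static_cast<unsigned long long>(n), d);
}

int main()
{
  ML::idx_t n = 3000000000ULL;  // above INT32_MAX: the large-input regime this PR targets
  launcher(n, 2);
  return 0;
}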
10 changes: 6 additions & 4 deletions cpp/src/umap/knn_graph/algo.cuh
@@ -35,6 +35,7 @@

 #include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/brute_force.hpp>
+#include <stdint.h>

 #include <iostream>

@@ -138,9 +139,10 @@ inline void launcher(const raft::handle_t& handle,
       target[i * n_neighbors + j] = source[i * graph_degree + j];
     }
   }
-  raft::copy(handle,
-             raft::make_device_matrix_view(out.knn_indices, inputsA.n, n_neighbors),
-             temp_indices_h.view());
+  raft::copy(
+    handle,
+    raft::make_device_matrix_view(out.knn_indices, inputsA.n, static_cast<uint64_t>(n_neighbors)),
+    temp_indices_h.view());

   // `graph.distances()` is a device array (n x graph_degree).
   // Slice and copy to the output device array `out.knn_dists` (n x n_neighbors).
@@ -152,7 +154,7 @@
   raft::matrix::slice<float, int64_t, raft::row_major>(
     handle,
     raft::make_const_mdspan(graph.distances().value()),
-    raft::make_device_matrix_view(out.knn_dists, inputsA.n, n_neighbors),
+    raft::make_device_matrix_view(out.knn_dists, inputsA.n, static_cast<uint64_t>(n_neighbors)),
     coords);
 }
18 changes: 8 additions & 10 deletions cpp/src/umap/runner.cuh
@@ -47,6 +47,8 @@
 #include <thrust/scan.h>
 #include <thrust/system/cuda/execution_policy.h>

+#include <stdint.h>
+
 #include <memory>

 namespace UMAPAlgo {

@@ -348,7 +350,7 @@
 /**
  *
  */
-template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
+template <typename value_idx, typename value_t, typename umap_inputs, uint64_t TPB_X>
 void _transform(const raft::handle_t& handle,
                 const umap_inputs& inputs,
                 umap_inputs& orig_x_inputs,
@@ -425,7 +427,7 @@ void _transform(const raft::handle_t& handle,
   /*
    * Compute graph of membership strengths
    */

-  int nnz = inputs.n * params->n_neighbors;
+  uint64_t nnz = uint64_t{inputs.n} * params->n_neighbors;

   dim3 grid_nnz(raft::ceildiv(nnz, TPB_X), 1, 1);

@@ -437,6 +439,7 @@

   raft::sparse::COO<value_t> graph_coo(stream, nnz, inputs.n, inputs.n);

+  uint64_t to_process = static_cast<uint64_t>(graph_coo.n_rows) * params->n_neighbors;
   FuzzySimplSetImpl::compute_membership_strength_kernel<TPB_X>
     <<<grid_nnz, blk, 0, stream>>>(knn_graph.knn_indices,
                                    knn_graph.knn_dists,
@@ -445,15 +448,13 @@ void _transform(const raft::handle_t& handle,
                                    graph_coo.vals(),
                                    graph_coo.rows(),
                                    graph_coo.cols(),
-                                   graph_coo.n_rows,
-                                   params->n_neighbors);
+                                   params->n_neighbors,
+                                   to_process);
   RAFT_CUDA_TRY(cudaPeekAtLastError());

-  rmm::device_uvector<int> row_ind(inputs.n, stream);
-  rmm::device_uvector<int> ia(inputs.n, stream);
+  rmm::device_uvector<uint64_t> row_ind(inputs.n, stream);

   raft::sparse::convert::sorted_coo_to_csr(&graph_coo, row_ind.data(), stream);
-  raft::sparse::linalg::coo_degree(&graph_coo, ia.data(), stream);

   rmm::device_uvector<value_t> vals_normed(graph_coo.nnz, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(vals_normed.data(), 0, graph_coo.nnz * sizeof(value_t), stream));

Member, on the new nnz computation: Please don't hardcode these types. Create an nnz_t.
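A sketch of that suggestion (the alias spelling is assumed): name the nonzero-count type once, and make the 64-bit promotion happen before the multiply instead of after it has already overflowed.

#include <cstdint>
#include <cstdio>

using nnz_t = uint64_t;  // hypothetical alias per the review comment

int main()
{
  int n           = 40000000;  // rows, in the large-input regime
  int n_neighbors = 64;

  // In plain int arithmetic, n * n_neighbors would need 2.56e9 > INT32_MAX
  // and overflow; promoting one operand first keeps the product in 64 bits.
  nnz_t nnz = static_cast<nnz_t>(n) * n_neighbors;

  std::printf("nnz = %llu\n", static_cast<unsigned long long>(nnz));
  return 0;
}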


Contributor, on the removed coo_degree call: So we never needed the degree here in the first place?
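For context on why the degree array could go, a standalone sketch with made-up data (no RAFT dependency): the CSR row-pointer array that sorted_coo_to_csr produces already encodes each row's degree as the difference of adjacent offsets, so a separate coo_degree pass duplicates information.

#include <cstdio>
#include <vector>

int main()
{
  // Row indices of a sorted COO matrix with 3 rows, as sorted_coo_to_csr sees them.
  std::vector<int> coo_rows = {0, 0, 1, 2, 2, 2};
  const int n_rows = 3;

  // Count entries per row, then prefix-sum to get CSR row offsets.
  std::vector<int> ptr(n_rows + 1, 0);
  for (int r : coo_rows) ++ptr[r + 1];
  for (int i = 0; i < n_rows; ++i) ptr[i + 1] += ptr[i];

  // Degree of row i falls out as ptr[i + 1] - ptr[i]; no extra pass required.
  for (int i = 0; i < n_rows; ++i)
    std::printf("row %d: degree %d\n", i, ptr[i + 1] - ptr[i]);
  return 0;
}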
@@ -471,9 +472,6 @@
                 params->n_components,
                 transformed,
                 params->n_neighbors);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  RAFT_CUDA_TRY(cudaMemsetAsync(ia.data(), 0.0, ia.size() * sizeof(int), stream));

   RAFT_CUDA_TRY(cudaPeekAtLastError());