rapidsai · viclafargue · Jan 22, 2025 · Jan 22, 2025 · Jan 23, 2025 · Jan 23, 2025
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@
 
 #pragma once
 
+#include <stdint.h>
+
 namespace ML {
 
 // Dense input uses int64_t until FAISS is updated
@@ -55,8 +57,8 @@ struct knn_graph {
 template <typename T>
 struct manifold_inputs_t {
   T* y;
-  int n;
-  int d;
+  uint64_t n;
+  uint64_t d;
 
   manifold_inputs_t(T* y_, int n_, int d_) : y(y_), n(n_), d(d_) {}
 

@@ -37,6 +37,7 @@
 
 #include <cuvs/distance/distance.hpp>
 #include <pca/pca.cuh>
+#include <stdint.h>
 
 namespace ML {
 
@@ -167,7 +168,7 @@ class TSNE_runner {
   {
     distance_and_perplexity();
 
-    const auto NNZ  = COO_Matrix.nnz;
+    const auto NNZ  = value_idx{COO_Matrix.nnz};
     auto* VAL       = COO_Matrix.vals();
     const auto* COL = COO_Matrix.cols();
     const auto* ROW = COO_Matrix.rows();

@@ -30,6 +30,7 @@
 
 #include <cuda_runtime.h>
 
+#include <stdint.h>
 #include <stdio.h>
 
 #include <string>
@@ -91,8 +92,8 @@ CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists,
                                         float bandwidth          = 1.0)
 {
   // row-based matrix 1 thread per row
-  int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  int i   = row * n_neighbors;  // each thread processes one row of the dist matrix
+  int row    = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t i = uint64_t{row} * n_neighbors;  // each thread processes one row of the dist matrix
 
   if (row < n) {
     float target = __log2f(n_neighbors) * bandwidth;
@@ -190,7 +191,7 @@ CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists,
  *
  * Descriptions adapted from: https://github.com/lmcinnes/umap/blob/master/umap/umap_.py
  */
-template <int TPB_X, typename value_idx, typename value_t>
+template <uint64_t TPB_X, typename value_idx, typename value_t>
 CUML_KERNEL void compute_membership_strength_kernel(
   const value_idx* knn_indices,
   const float* knn_dists,  // nn outputs
@@ -199,14 +200,14 @@ CUML_KERNEL void compute_membership_strength_kernel(
   value_t* vals,
   int* rows,
   int* cols,  // result coo
-  int n,
-  int n_neighbors)
+  int n_neighbors,
+  uint64_t to_process)
 {  // model params
 
   // row-based matrix is best
-  int idx = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t idx = (blockIdx.x * TPB_X) + threadIdx.x;
 
-  if (idx < n * n_neighbors) {
+  if (idx < to_process) {
     int row = idx / n_neighbors;  // one neighbor per thread
 
     double cur_rho   = rhos[row];
@@ -237,8 +238,8 @@ CUML_KERNEL void compute_membership_strength_kernel(
 /*
  * Sets up and runs the knn dist smoothing
  */
-template <int TPB_X, typename value_idx, typename value_t>
-void smooth_knn_dist(int n,
+template <uint64_t TPB_X, typename value_idx, typename value_t>
+void smooth_knn_dist(uint64_t n,
                      const value_idx* knn_indices,
                      const float* knn_dists,
                      value_t* rhos,
@@ -253,7 +254,8 @@ void smooth_knn_dist(int n,
 
   rmm::device_uvector<value_t> dist_means_dev(n_neighbors, stream);
 
-  raft::stats::mean(dist_means_dev.data(), knn_dists, 1, n_neighbors * n, false, false, stream);
+  raft::stats::mean(
+    dist_means_dev.data(), knn_dists, uint64_t{1}, n * n_neighbors, false, false, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   value_t mean_dist = 0.0;
@@ -284,8 +286,8 @@ void smooth_knn_dist(int n,
  * @param params UMAPParams config object
  * @param stream cuda stream to use for device operations
  */
-template <int TPB_X, typename value_idx, typename value_t>
-void launcher(int n,
+template <uint64_t TPB_X, typename value_idx, typename value_t>
+void launcher(uint64_t n,
               const value_idx* knn_indices,
               const value_t* knn_dists,
               int n_neighbors,
@@ -328,7 +330,8 @@ void launcher(int n,
    * Compute graph of membership strengths
    */
 
-  dim3 grid_elm(raft::ceildiv(n * n_neighbors, TPB_X), 1, 1);
+  uint64_t to_process = {in.n_rows} * n_neighbors;
+  dim3 grid_elm(raft::ceildiv(to_process, TPB_X), 1, 1);
   dim3 blk_elm(TPB_X, 1, 1);
 
   compute_membership_strength_kernel<TPB_X><<<grid_elm, blk_elm, 0, stream>>>(knn_indices,
@@ -338,8 +341,8 @@ void launcher(int n,
                                                                               in.vals(),
                                                                               in.rows(),
                                                                               in.cols(),
-                                                                              in.n_rows,
-                                                                              n_neighbors);
+                                                                              n_neighbors,
+                                                                              to_process);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   if (ML::default_logger().should_log(ML::level_enum::debug)) {

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,14 +20,16 @@
 
 #include <raft/random/rng.cuh>
 
+#include <stdint.h>
+
 namespace UMAPAlgo {
 namespace InitEmbed {
 namespace RandomInit {
 
 using namespace ML;
 
 template <typename T>
-void launcher(int n, int d, UMAPParams* params, T* embedding, cudaStream_t stream)
+void launcher(uint64_t n, int d, UMAPParams* params, T* embedding, cudaStream_t stream)
 {
   uint64_t seed = params->random_state;
 

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
 #include <thrust/execution_policy.h>
 #include <thrust/extrema.h>
 
+#include <stdint.h>
+
 #include <iostream>
 
 namespace UMAPAlgo {
@@ -44,7 +46,7 @@ using namespace ML;
  */
 template <typename T>
 void launcher(const raft::handle_t& handle,
-              int n,
+              uint64_t n,
               int d,
               raft::sparse::COO<float>* coo,
               UMAPParams* params,

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -35,6 +35,7 @@
 
 #include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/brute_force.hpp>
+#include <stdint.h>
 
 #include <iostream>
 
@@ -126,11 +127,12 @@ inline void launcher(const raft::handle_t& handle,
 
     RAFT_EXPECTS(graph.distances().has_value(),
                  "return_distances for nn descent should be set to true to be used for UMAP");
-    auto out_knn_dists_view = raft::make_device_matrix_view(out.knn_dists, inputsA.n, n_neighbors);
+    auto out_knn_dists_view =
+      raft::make_device_matrix_view(out.knn_dists, inputsA.n, uint64_t{n_neighbors});
     raft::matrix::slice<float, int64_t, raft::row_major>(
       handle, raft::make_const_mdspan(graph.distances().value()), out_knn_dists_view, coords);
     auto out_knn_indices_view =
-      raft::make_device_matrix_view(out.knn_indices, inputsA.n, n_neighbors);
+      raft::make_device_matrix_view(out.knn_indices, inputsA.n, uint64_t{n_neighbors});
     raft::matrix::slice<int64_t, int64_t, raft::row_major>(
       handle, raft::make_const_mdspan(indices_d.view()), out_knn_indices_view, coords);
   }

@@ -47,6 +47,8 @@
 #include <thrust/scan.h>
 #include <thrust/system/cuda/execution_policy.h>
 
+#include <stdint.h>
+
 #include <memory>
 
 namespace UMAPAlgo {
@@ -348,7 +350,7 @@ void _fit_supervised(const raft::handle_t& handle,
 /**
  *
  */
-template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
+template <typename value_idx, typename value_t, typename umap_inputs, uint64_t TPB_X>
 void _transform(const raft::handle_t& handle,
                 const umap_inputs& inputs,
                 umap_inputs& orig_x_inputs,
@@ -425,7 +427,7 @@ void _transform(const raft::handle_t& handle,
    * Compute graph of membership strengths
    */
 
-  int nnz = inputs.n * params->n_neighbors;
+  uint64_t nnz = uint64_t{inputs.n} * params->n_neighbors;
 
   dim3 grid_nnz(raft::ceildiv(nnz, TPB_X), 1, 1);
 
@@ -437,6 +439,7 @@ void _transform(const raft::handle_t& handle,
 
   raft::sparse::COO<value_t> graph_coo(stream, nnz, inputs.n, inputs.n);
 
+  uint64_t to_process = uint64_t{graph_coo.n_rows} * params->n_neighbors;
   FuzzySimplSetImpl::compute_membership_strength_kernel<TPB_X>
     <<<grid_nnz, blk, 0, stream>>>(knn_graph.knn_indices,
                                    knn_graph.knn_dists,
@@ -445,15 +448,13 @@ void _transform(const raft::handle_t& handle,
                                    graph_coo.vals(),
                                    graph_coo.rows(),
                                    graph_coo.cols(),
-                                   graph_coo.n_rows,
-                                   params->n_neighbors);
+                                   params->n_neighbors,
+                                   to_process);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  rmm::device_uvector<int> row_ind(inputs.n, stream);
-  rmm::device_uvector<int> ia(inputs.n, stream);
+  rmm::device_uvector<uint64_t> row_ind(inputs.n, stream);
 
   raft::sparse::convert::sorted_coo_to_csr(&graph_coo, row_ind.data(), stream);
-  raft::sparse::linalg::coo_degree(&graph_coo, ia.data(), stream);
 
   rmm::device_uvector<value_t> vals_normed(graph_coo.nnz, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(vals_normed.data(), 0, graph_coo.nnz * sizeof(value_t), stream));
@@ -471,9 +472,6 @@ void _transform(const raft::handle_t& handle,
                                                              params->n_components,
                                                              transformed,
                                                              params->n_neighbors);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  RAFT_CUDA_TRY(cudaMemsetAsync(ia.data(), 0.0, ia.size() * sizeof(int), stream));
 
   RAFT_CUDA_TRY(cudaPeekAtLastError());