From 2e06ab95babc0867fda210c8acfd54adbb05443c Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Tue, 14 Jan 2025 15:04:36 +0000
Subject: [PATCH 01/25] Fix sparse utilities

---
 .../raft/cluster/detail/connectivities.cuh    |  2 +-
 cpp/include/raft/cluster/detail/mst.cuh       |  4 +-
 cpp/include/raft/sparse/convert/csr.cuh       | 10 ++---
 .../raft/sparse/convert/detail/csr.cuh        |  8 ++--
 cpp/include/raft/sparse/detail/coo.cuh        | 14 +++----
 cpp/include/raft/sparse/detail/utils.h        |  6 +--
 cpp/include/raft/sparse/linalg/degree.cuh     |  6 +--
 .../raft/sparse/linalg/detail/degree.cuh      | 20 +++++-----
 .../raft/sparse/linalg/detail/norm.cuh        | 22 +++++------
 .../raft/sparse/linalg/detail/symmetrize.cuh  | 18 ++++-----
 cpp/include/raft/sparse/linalg/norm.cuh       |  8 ++--
 .../neighbors/detail/cross_component_nn.cuh   |  2 +-
 cpp/include/raft/sparse/op/detail/filter.cuh  | 37 +++++++++----------
 cpp/include/raft/sparse/op/detail/sort.h      |  2 +-
 cpp/include/raft/sparse/op/sort.cuh           |  2 +-
 .../raft/spatial/knn/detail/ball_cover.cuh    |  2 +-
 cpp/test/sparse/solver/lanczos.cu             |  4 +-
 cpp/test/sparse/symmetrize.cu                 |  4 +-
 18 files changed, 85 insertions(+), 86 deletions(-)
diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh
index c527b754c3..fdb8af9171 100644
--- a/cpp/include/raft/cluster/detail/connectivities.cuh
+++ b/cpp/include/raft/cluster/detail/connectivities.cuh
@@ -95,7 +95,7 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
                       });
 
     raft::sparse::convert::sorted_coo_to_csr(
-      knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream);
+      knn_graph_coo.rows(), (value_idx)knn_graph_coo.nnz, indptr.data(), m + 1, stream);
 
     // TODO: Wouldn't need to copy here if we could compute knn
     // graph directly on the device uvectors
diff --git a/cpp/include/raft/cluster/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh
index 55becc8e15..fff2a535ec 100644
--- a/cpp/include/raft/cluster/detail/mst.cuh
+++ b/cpp/include/raft/cluster/detail/mst.cuh
@@ -98,7 +98,7 @@ void connect_knn_graph(
 
   rmm::device_uvector<value_idx> indptr2(m + 1, stream);
   raft::sparse::convert::sorted_coo_to_csr(
-    connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream);
+    connected_edges.rows(), (value_idx)connected_edges.nnz, indptr2.data(), m + 1, stream);
 
   // On the second call, we hand the MST the original colors
   // and the new set of edges and let it restart the optimization process
@@ -204,4 +204,4 @@ void build_sorted_mst(
   raft::copy_async(mst_weight, mst_coo.weights.data(), mst_coo.n_edges, stream);
 }
 
-};  // namespace raft::cluster::detail
\ No newline at end of file
+};  // namespace raft::cluster::detail
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 081192ed44..cbe20d8d3a 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -52,8 +52,8 @@ void coo_to_csr(raft::resources const& handle,
  * @param m: number of rows in dense matrix
  * @param stream: cuda stream to use
  */
-template <typename T>
-void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t stream)
+template <typename T, typename outT>
+void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream_t stream)
 {
   detail::sorted_coo_to_csr(rows, nnz, row_ind, m, stream);
 }
@@ -65,10 +65,10 @@ void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t s
  * @param row_ind: output row indices array
  * @param stream: cuda stream to use
  */
-template <typename T>
-void sorted_coo_to_csr(COO<T>* coo, int* row_ind, cudaStream_t stream)
+template <typename T, typename outT>
+void sorted_coo_to_csr(COO<T>* coo, outT* row_ind, cudaStream_t stream)
 {
-  detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
+  detail::sorted_coo_to_csr(coo->rows(), (outT)coo->nnz, row_ind, coo->n_rows, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh
index a5d7de9a07..79ed0d36ff 100644
--- a/cpp/include/raft/sparse/convert/detail/csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/csr.cuh
@@ -84,18 +84,18 @@ void coo_to_csr(raft::resources const& handle,
  * @param m: number of rows in dense matrix
  * @param stream: cuda stream to use
  */
-template <typename T>
-void sorted_coo_to_csr(const T* rows, int nnz, T* row_ind, int m, cudaStream_t stream)
+template <typename T, typename outT>
+void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream_t stream)
 {
   rmm::device_uvector<T> row_counts(m, stream);
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, m * sizeof(T), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, (uint64_t)m * sizeof(T), stream));
 
   linalg::coo_degree(rows, nnz, row_counts.data(), stream);
 
   // create csr compressed row index from row counts
   thrust::device_ptr<T> row_counts_d = thrust::device_pointer_cast(row_counts.data());
-  thrust::device_ptr<T> c_ind_d      = thrust::device_pointer_cast(row_ind);
+  thrust::device_ptr<outT> c_ind_d      = thrust::device_pointer_cast(row_ind);
   exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d);
 }
 
diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh
index 9a38c11a07..0d2a06bf03 100644
--- a/cpp/include/raft/sparse/detail/coo.cuh
+++ b/cpp/include/raft/sparse/detail/coo.cuh
@@ -52,7 +52,7 @@ class COO {
   rmm::device_uvector<T> vals_arr;
 
  public:
-  Index_Type nnz;
+  uint64_t nnz;
   Index_Type n_rows;
   Index_Type n_cols;
 
@@ -75,7 +75,7 @@ class COO {
   COO(rmm::device_uvector<Index_Type>& rows,
       rmm::device_uvector<Index_Type>& cols,
       rmm::device_uvector<T>& vals,
-      Index_Type nnz,
+      uint64_t nnz,
       Index_Type n_rows = 0,
       Index_Type n_cols = 0)
     : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols)
@@ -90,7 +90,7 @@ class COO {
    * @param init: initialize arrays with zeros
    */
   COO(cudaStream_t stream,
-      Index_Type nnz,
+      uint64_t nnz,
       Index_Type n_rows = 0,
       Index_Type n_cols = 0,
       bool init         = true)
@@ -121,7 +121,7 @@ class COO {
    */
   bool validate_size() const
   {
-    if (this->nnz < 0 || n_rows < 0 || n_cols < 0) return false;
+    if (this->nnz <= 0 || n_rows <= 0 || n_cols <= 0) return false;
     return true;
   }
 
@@ -204,7 +204,7 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(Index_Type nnz, bool init, cudaStream_t stream)
+  void allocate(uint64_t nnz, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, 0, init, stream);
   }
@@ -216,7 +216,7 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(Index_Type nnz, Index_Type size, bool init, cudaStream_t stream)
+  void allocate(uint64_t nnz, Index_Type size, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, size, size, init, stream);
   }
@@ -230,7 +230,7 @@ class COO {
    * @param stream: stream to use for init
    */
   void allocate(
-    Index_Type nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
+    uint64_t nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
diff --git a/cpp/include/raft/sparse/detail/utils.h b/cpp/include/raft/sparse/detail/utils.h
index 3eed74f3b4..16db863a2d 100644
--- a/cpp/include/raft/sparse/detail/utils.h
+++ b/cpp/include/raft/sparse/detail/utils.h
@@ -103,10 +103,10 @@ void iota_fill(value_idx* indices, value_idx nrows, value_idx ncols, cudaStream_
   iota_fill_block_kernel<<<nrows, blockdim, 0, stream>>>(indices, ncols);
 }
 
-template <typename T>
-__device__ int get_stop_idx(T row, T m, T nnz, const T* ind)
+template <typename T, typename indT>
+__device__ indT get_stop_idx(T row, T m, indT nnz, const indT* ind)
 {
-  int stop_idx = 0;
+  indT stop_idx = 0;
   if (row < (m - 1))
     stop_idx = ind[row + 1];
   else
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 57c9b986b4..5da4c9a30d 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -34,7 +34,7 @@ namespace linalg {
  * @param stream: cuda stream to use
  */
 template <typename T = int>
-void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream)
+void coo_degree(const T* rows, uint64_t nnz, T* results, cudaStream_t stream)
 {
   detail::coo_degree<64, T>(rows, nnz, results, stream);
 }
@@ -66,7 +66,7 @@ void coo_degree(COO<T>* in, int* results, cudaStream_t stream)
  */
 template <typename T>
 void coo_degree_scalar(
-  const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results, cudaStream_t stream = 0)
 {
   detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream);
 }
@@ -120,4 +120,4 @@ void coo_degree_nz(COO<T>* in, int* results, cudaStream_t stream)
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/linalg/detail/degree.cuh b/cpp/include/raft/sparse/linalg/detail/degree.cuh
index df31192cf7..6338b8eb00 100644
--- a/cpp/include/raft/sparse/linalg/detail/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/degree.cuh
@@ -39,10 +39,10 @@ namespace detail {
  * @param nnz the size of the rows array
  * @param results array to place results
  */
-template <int TPB_X = 64, typename T = int>
-RAFT_KERNEL coo_degree_kernel(const T* rows, int nnz, T* results)
+template <uint64_t TPB_X = 64, typename T = int>
+RAFT_KERNEL coo_degree_kernel(const T* rows, uint64_t nnz, T* results)
 {
-  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < nnz) { atomicAdd(results + rows[row], (T)1); }
 }
 
@@ -54,8 +54,8 @@ RAFT_KERNEL coo_degree_kernel(const T* rows, int nnz, T* results)
  * @param results: output result array
  * @param stream: cuda stream to use
  */
-template <int TPB_X = 64, typename T = int>
-void coo_degree(const T* rows, int nnz, T* results, cudaStream_t stream)
+template <uint64_t TPB_X = 64, typename T = int>
+void coo_degree(const T* rows, uint64_t nnz, T* results, cudaStream_t stream)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
@@ -71,11 +71,11 @@ RAFT_KERNEL coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* r
   if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
-template <int TPB_X = 64, typename T>
+template <uint64_t TPB_X = 64, typename T>
 RAFT_KERNEL coo_degree_scalar_kernel(
-  const int* rows, const T* vals, int nnz, T scalar, int* results)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results)
 {
-  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
@@ -90,9 +90,9 @@ RAFT_KERNEL coo_degree_scalar_kernel(
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <int TPB_X = 64, typename T>
+template <uint64_t TPB_X = 64, typename T>
 void coo_degree_scalar(
-  const int* rows, const T* vals, int nnz, T scalar, int* results, cudaStream_t stream = 0)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results, cudaStream_t stream = 0)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh
index 3702111f83..5dc5b2db41 100644
--- a/cpp/include/raft/sparse/linalg/detail/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh
@@ -40,14 +40,14 @@ namespace sparse {
 namespace linalg {
 namespace detail {
 
-template <int TPB_X = 64, typename T>
+template <int TPB_X = 64, typename T, typename indT>
 RAFT_KERNEL csr_row_normalize_l1_kernel(
   // @TODO: This can be done much more parallel by
   // having threads in a warp compute the sum in parallel
   // over each row and then divide the values in parallel.
-  const int* ia,  // csr row ex_scan (sorted by row)
+  const indT* ia,  // csr row ex_scan (sorted by row)
   const T* vals,
-  int nnz,  // array of values and number of non-zeros
+  indT nnz,  // array of values and number of non-zeros
   int m,    // num rows in csr
   T* result)
 {  // output array
@@ -57,19 +57,19 @@ RAFT_KERNEL csr_row_normalize_l1_kernel(
 
   // sum all vals_arr for row and divide each val by sum
   if (row < m) {
-    int start_idx = ia[row];
-    int stop_idx  = 0;
+    indT start_idx = ia[row];
+    indT stop_idx  = 0;
     if (row < m - 1) {
       stop_idx = ia[row + 1];
     } else
       stop_idx = nnz;
 
     T sum = T(0.0);
-    for (int j = start_idx; j < stop_idx; j++) {
+    for (indT j = start_idx; j < stop_idx; j++) {
       sum = sum + fabs(vals[j]);
     }
 
-    for (int j = start_idx; j < stop_idx; j++) {
+    for (indT j = start_idx; j < stop_idx; j++) {
       if (sum != 0.0) {
         T val     = vals[j];
         result[j] = val / sum;
@@ -90,10 +90,10 @@ RAFT_KERNEL csr_row_normalize_l1_kernel(
  * @param result: l1 normalized data array
  * @param stream: cuda stream to use
  */
-template <int TPB_X = 64, typename T>
-void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
+template <int TPB_X = 64, typename T, typename indT>
+void csr_row_normalize_l1(const indT* ia,  // csr row ex_scan (sorted by row)
                           const T* vals,
-                          int nnz,  // array of values and number of non-zeros
+                          indT nnz,  // array of values and number of non-zeros
                           int m,    // num rows in csr
                           T* result,
                           cudaStream_t stream)
@@ -232,4 +232,4 @@ void rowNormCsrCaller(const IdxType* ia,
 };  // end NAMESPACE detail
 };  // end NAMESPACE linalg
 };  // end NAMESPACE sparse
-};  // end NAMESPACE raft
\ No newline at end of file
+};  // end NAMESPACE raft
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index d343bcbf66..af0b6feb37 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -48,7 +48,7 @@ namespace detail {
 // TODO: value_idx param needs to be used for this once FAISS is updated to use float32
 // for indices so that the index types can be uniform
 template <int TPB_X = 128, typename T, typename Lambda>
-RAFT_KERNEL coo_symmetrize_kernel(int* row_ind,
+RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
                                   int* rows,
                                   int* cols,
                                   T* vals,
@@ -56,17 +56,17 @@ RAFT_KERNEL coo_symmetrize_kernel(int* row_ind,
                                   int* ocols,
                                   T* ovals,
                                   int n,
-                                  int cnnz,
+                                  uint64_t cnnz,
                                   Lambda reduction_op)
 {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < n) {
-    int start_idx = row_ind[row];  // each thread processes one row
-    int stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
+    uint64_t start_idx = row_ind[row];  // each thread processes one row
+    uint64_t stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
 
     int row_nnz       = 0;
-    int out_start_idx = start_idx * 2;
+    uint64_t out_start_idx = start_idx * 2;
 
     for (int idx = 0; idx < stop_idx - start_idx; idx++) {
       int cur_row = rows[idx + start_idx];
@@ -74,13 +74,13 @@ RAFT_KERNEL coo_symmetrize_kernel(int* row_ind,
       T cur_val   = vals[idx + start_idx];
 
       int lookup_row = cur_col;
-      int t_start    = row_ind[lookup_row];  // Start at
-      int t_stop     = get_stop_idx(lookup_row, n, cnnz, row_ind);
+      uint64_t t_start    = row_ind[lookup_row];  // Start at
+      uint64_t t_stop     = get_stop_idx(lookup_row, n, cnnz, row_ind);
 
       T transpose = 0.0;
 
       bool found_match = false;
-      for (int t_idx = t_start; t_idx < t_stop; t_idx++) {
+      for (uint64_t t_idx = t_start; t_idx < t_stop; t_idx++) {
         // If we find a match, let's get out of the loop. We won't
         // need to modify the transposed value, since that will be
         // done in a different thread.
@@ -142,7 +142,7 @@ void coo_symmetrize(COO<T>* in,
 
   ASSERT(!out->validate_mem(), "Expecting unallocated COO for output");
 
-  rmm::device_uvector<int> in_row_ind(in->n_rows, stream);
+  rmm::device_uvector<uint64_t> in_row_ind(in->n_rows, stream);
 
   convert::sorted_coo_to_csr(in, in_row_ind.data(), stream);
 
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
index 43dd182fe5..ea325c3472 100644
--- a/cpp/include/raft/sparse/linalg/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -36,10 +36,10 @@ namespace linalg {
  * @param result: l1 normalized data array
  * @param stream: cuda stream to use
  */
-template <typename T>
-void csr_row_normalize_l1(const int* ia,  // csr row ex_scan (sorted by row)
+template <typename T, typename indT>
+void csr_row_normalize_l1(const indT* ia,  // csr row ex_scan (sorted by row)
                           const T* vals,
-                          int nnz,  // array of values and number of non-zeros
+                          indT nnz,  // array of values and number of non-zeros
                           int m,    // num rows in csr
                           T* result,
                           cudaStream_t stream)
@@ -104,4 +104,4 @@ void rowNormCsr(raft::resources const& handle,
 };  // end NAMESPACE sparse
 };  // end NAMESPACE raft
 
-#endif
\ No newline at end of file
+#endif
diff --git a/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh b/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
index a47d5a6f34..1247b91d2e 100644
--- a/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
@@ -242,7 +242,7 @@ void perform_1nn(raft::resources const& handle,
   // the color components.
   auto colors_group_idxs = raft::make_device_vector<value_idx, value_idx>(handle, n_components + 1);
   raft::sparse::convert::sorted_coo_to_csr(
-    colors, n_rows, colors_group_idxs.data_handle(), n_components + 1, stream);
+    colors, (value_idx)n_rows, colors_group_idxs.data_handle(), n_components + 1, stream);
 
   auto group_idxs_view = raft::make_device_vector_view<const value_idx, value_idx>(
     colors_group_idxs.data_handle() + 1, n_components);
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index 3df85e6871..700885240c 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -42,27 +42,27 @@ namespace sparse {
 namespace op {
 namespace detail {
 
-template <int TPB_X, typename T>
+template <uint64_t TPB_X, typename T>
 RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
                                      const int* cols,
                                      const T* vals,
-                                     int nnz,
+                                     uint64_t nnz,
                                      int* crows,
                                      int* ccols,
                                      T* cvals,
-                                     int* ex_scan,
-                                     int* cur_ex_scan,
+                                     uint64_t* ex_scan,
+                                     uint64_t* cur_ex_scan,
                                      int m,
                                      T scalar)
 {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < m) {
-    int start       = cur_ex_scan[row];
-    int stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
-    int cur_out_idx = ex_scan[row];
+    uint64_t start       = cur_ex_scan[row];
+    uint64_t stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
+    uint64_t cur_out_idx = ex_scan[row];
 
-    for (int idx = start; idx < stop; idx++) {
+    for (uint64_t idx = start; idx < stop; idx++) {
       if (vals[idx] != scalar) {
         crows[cur_out_idx] = rows[idx];
         ccols[cur_out_idx] = cols[idx];
@@ -94,7 +94,7 @@ template <int TPB_X, typename T>
 void coo_remove_scalar(const int* rows,
                        const int* cols,
                        const T* vals,
-                       int nnz,
+                       uint64_t nnz,
                        int* crows,
                        int* ccols,
                        T* cvals,
@@ -104,19 +104,19 @@ void coo_remove_scalar(const int* rows,
                        int n,
                        cudaStream_t stream)
 {
-  rmm::device_uvector<int> ex_scan(n, stream);
-  rmm::device_uvector<int> cur_ex_scan(n, stream);
+  rmm::device_uvector<uint64_t> ex_scan(n, stream);
+  rmm::device_uvector<uint64_t> cur_ex_scan(n, stream);
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, n * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, n * sizeof(int), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(int), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (uint64_t)n * sizeof(int), stream));
 
   thrust::device_ptr<int> dev_cnnz    = thrust::device_pointer_cast(cnnz);
-  thrust::device_ptr<int> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
+  thrust::device_ptr<uint64_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
   thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   thrust::device_ptr<int> dev_cur_cnnz    = thrust::device_pointer_cast(cur_cnnz);
-  thrust::device_ptr<int> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
+  thrust::device_ptr<uint64_t> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
   thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
@@ -151,8 +151,8 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
   rmm::device_uvector<int> row_count_nz(in->n_rows, stream);
   rmm::device_uvector<int> row_count(in->n_rows, stream);
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, in->n_rows * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, in->n_rows * sizeof(int), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(int), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(int), stream));
 
   linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
@@ -161,8 +161,7 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   thrust::device_ptr<int> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
-  int out_nnz =
-    thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
+  uint64_t out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows, (uint64_t)0);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
diff --git a/cpp/include/raft/sparse/op/detail/sort.h b/cpp/include/raft/sparse/op/detail/sort.h
index 02287c2367..7d09ebeddc 100644
--- a/cpp/include/raft/sparse/op/detail/sort.h
+++ b/cpp/include/raft/sparse/op/detail/sort.h
@@ -69,7 +69,7 @@ struct TupleComp {
  * @param stream: cuda stream to use
  */
 template <typename T, typename IdxT = int>
-void coo_sort(IdxT m, IdxT n, IdxT nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
+void coo_sort(IdxT m, IdxT n, uint64_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
 {
   auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
 
diff --git a/cpp/include/raft/sparse/op/sort.cuh b/cpp/include/raft/sparse/op/sort.cuh
index 5b8a792429..35b5fd9f31 100644
--- a/cpp/include/raft/sparse/op/sort.cuh
+++ b/cpp/include/raft/sparse/op/sort.cuh
@@ -38,7 +38,7 @@ namespace op {
  * @param stream: cuda stream to use
  */
 template <typename T, typename IdxT = int>
-void coo_sort(IdxT m, IdxT n, IdxT nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
+void coo_sort(IdxT m, IdxT n, uint64_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
 {
   detail::coo_sort(m, n, nnz, rows, cols, vals, stream);
 }
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
index c4ca2ffa61..f436d5c740 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
@@ -161,7 +161,7 @@ void construct_landmark_1nn(raft::resources const& handle,
 
   // convert to CSR for fast lookup
   raft::sparse::convert::sorted_coo_to_csr(R_1nn_inds.data(),
-                                           index.m,
+                                           (value_idx)index.m,
                                            index.get_R_indptr().data_handle(),
                                            index.n_landmarks + 1,
                                            resource::get_cuda_stream(handle));
diff --git a/cpp/test/sparse/solver/lanczos.cu b/cpp/test/sparse/solver/lanczos.cu
index 128ab73747..23b3a7ff99 100644
--- a/cpp/test/sparse/solver/lanczos.cu
+++ b/cpp/test/sparse/solver/lanczos.cu
@@ -173,7 +173,7 @@ class rmat_lanczos_tests
       raft::make_device_vector<IndexType, uint32_t, raft::row_major>(handle,
                                                                      symmetric_coo.n_rows + 1);
     raft::sparse::convert::sorted_coo_to_csr(symmetric_coo.rows(),
-                                             symmetric_coo.nnz,
+                                             (IndexType)symmetric_coo.nnz,
                                              row_indices.data_handle(),
                                              symmetric_coo.n_rows + 1,
                                              stream);
@@ -198,7 +198,7 @@ class rmat_lanczos_tests
       symmetric_coo.cols(),
       symmetric_coo.vals(),
       symmetric_coo.n_rows,
-      symmetric_coo.nnz};
+      (IndexType)symmetric_coo.nnz};
     raft::sparse::solver::lanczos_solver_config<ValueType> config{
       n_components, params.maxiter, params.restartiter, params.tol, rng.seed};
 
diff --git a/cpp/test/sparse/symmetrize.cu b/cpp/test/sparse/symmetrize.cu
index e1a74dc40b..c3a03a942c 100644
--- a/cpp/test/sparse/symmetrize.cu
+++ b/cpp/test/sparse/symmetrize.cu
@@ -109,8 +109,8 @@ class SparseSymmetrizeTest
     rmm::device_scalar<value_idx> sum(stream);
     sum.set_value_to_zero_async(stream);
 
-    assert_symmetry<<<raft::ceildiv(out.nnz, 256), 256, 0, stream>>>(
-      out.rows(), out.cols(), out.vals(), out.nnz, sum.data());
+    assert_symmetry<<<raft::ceildiv(out.nnz, (uint64_t)256), 256, 0, stream>>>(
+      out.rows(), out.cols(), out.vals(), (value_idx)out.nnz, sum.data());
 
     sum_h = sum.value(stream);
     resource::sync_stream(handle, stream);

From 3cd388069cbbec7dee5bd19435cea03a9265a79d Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Tue, 21 Jan 2025 17:04:30 +0000
Subject: [PATCH 02/25] Additionnal fixes

---
 .../raft/sparse/convert/detail/csr.cuh        |  7 +-
 cpp/include/raft/sparse/linalg/degree.cuh     | 16 ++--
 .../raft/sparse/linalg/detail/degree.cuh      | 20 ++--
 .../raft/sparse/linalg/detail/spectral.cuh    |  4 +-
 .../raft/sparse/linalg/detail/symmetrize.cuh  | 14 +--
 cpp/include/raft/sparse/linalg/spectral.cuh   |  2 +-
 cpp/include/raft/sparse/op/detail/filter.cuh  | 95 ++++++++-----------
 .../raft/sparse/solver/detail/lanczos.cuh     |  6 +-
 8 files changed, 76 insertions(+), 88 deletions(-)

diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh
index 79ed0d36ff..aac5b26786 100644
--- a/cpp/include/raft/sparse/convert/detail/csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/csr.cuh
@@ -87,14 +87,13 @@ void coo_to_csr(raft::resources const& handle,
 template <typename T, typename outT>
 void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream_t stream)
 {
-  rmm::device_uvector<T> row_counts(m, stream);
-
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, (uint64_t)m * sizeof(T), stream));
+  rmm::device_uvector<outT> row_counts(m, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, (uint64_t)m * sizeof(outT), stream));
 
   linalg::coo_degree(rows, nnz, row_counts.data(), stream);
 
   // create csr compressed row index from row counts
-  thrust::device_ptr<T> row_counts_d = thrust::device_pointer_cast(row_counts.data());
+  thrust::device_ptr<outT> row_counts_d = thrust::device_pointer_cast(row_counts.data());
   thrust::device_ptr<outT> c_ind_d      = thrust::device_pointer_cast(row_ind);
   exclusive_scan(rmm::exec_policy(stream), row_counts_d, row_counts_d + m, c_ind_d);
 }
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 5da4c9a30d..a154c2357c 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -33,8 +33,8 @@ namespace linalg {
  * @param results: output result array
  * @param stream: cuda stream to use
  */
-template <typename T = int>
-void coo_degree(const T* rows, uint64_t nnz, T* results, cudaStream_t stream)
+template <typename T = int, typename outT>
+void coo_degree(const T* rows, uint64_t nnz, outT* results, cudaStream_t stream)
 {
   detail::coo_degree<64, T>(rows, nnz, results, stream);
 }
@@ -47,8 +47,8 @@ void coo_degree(const T* rows, uint64_t nnz, T* results, cudaStream_t stream)
  * @param results: output array with row counts (size=in->n_rows)
  * @param stream: cuda stream to use
  */
-template <typename T>
-void coo_degree(COO<T>* in, int* results, cudaStream_t stream)
+template <typename T, typename outT>
+void coo_degree(COO<T>* in, outT* results, cudaStream_t stream)
 {
   coo_degree(in->rows(), in->nnz, results, stream);
 }
@@ -64,9 +64,9 @@ void coo_degree(COO<T>* in, int* results, cudaStream_t stream)
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <typename T>
+template <typename T, typename outT>
 void coo_degree_scalar(
-  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results, cudaStream_t stream = 0)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, outT* results, cudaStream_t stream = 0)
 {
   detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream);
 }
@@ -80,8 +80,8 @@ void coo_degree_scalar(
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <typename T>
-void coo_degree_scalar(COO<T>* in, T scalar, int* results, cudaStream_t stream)
+template <typename T, typename outT>
+void coo_degree_scalar(COO<T>* in, T scalar, outT* results, cudaStream_t stream)
 {
   coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, results, stream);
 }
diff --git a/cpp/include/raft/sparse/linalg/detail/degree.cuh b/cpp/include/raft/sparse/linalg/detail/degree.cuh
index 6338b8eb00..d7f72e9b6c 100644
--- a/cpp/include/raft/sparse/linalg/detail/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/degree.cuh
@@ -39,11 +39,11 @@ namespace detail {
  * @param nnz the size of the rows array
  * @param results array to place results
  */
-template <uint64_t TPB_X = 64, typename T = int>
-RAFT_KERNEL coo_degree_kernel(const T* rows, uint64_t nnz, T* results)
+template <uint64_t TPB_X = 64, typename T = int, typename outT>
+RAFT_KERNEL coo_degree_kernel(const T* rows, uint64_t nnz, outT* results)
 {
   uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz) { atomicAdd(results + rows[row], (T)1); }
+  if (row < nnz) { atomicAdd(results + rows[row], (outT)1); }
 }
 
 /**
@@ -54,8 +54,8 @@ RAFT_KERNEL coo_degree_kernel(const T* rows, uint64_t nnz, T* results)
  * @param results: output result array
  * @param stream: cuda stream to use
  */
-template <uint64_t TPB_X = 64, typename T = int>
-void coo_degree(const T* rows, uint64_t nnz, T* results, cudaStream_t stream)
+template <uint64_t TPB_X = 64, typename T = int, typename outT>
+void coo_degree(const T* rows, uint64_t nnz, outT* results, cudaStream_t stream)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
@@ -71,12 +71,12 @@ RAFT_KERNEL coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* r
   if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
-template <uint64_t TPB_X = 64, typename T>
+template <uint64_t TPB_X = 64, typename T, typename outT>
 RAFT_KERNEL coo_degree_scalar_kernel(
-  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, outT* results)
 {
   uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
-  if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd(results + rows[row], 1); }
+  if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd((outT*)results + rows[row], (outT)1); }
 }
 
 /**
@@ -90,9 +90,9 @@ RAFT_KERNEL coo_degree_scalar_kernel(
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <uint64_t TPB_X = 64, typename T>
+template <uint64_t TPB_X = 64, typename T, typename outT>
 void coo_degree_scalar(
-  const int* rows, const T* vals, uint64_t nnz, T scalar, int* results, cudaStream_t stream = 0)
+  const int* rows, const T* vals, uint64_t nnz, T scalar, outT* results, cudaStream_t stream = 0)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index a1642d1455..ad2a567294 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -35,8 +35,8 @@ void fit_embedding(raft::resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
-                   int nnz,
-                   int n,
+                   uint64_t nnz,
+                   uint64_t n,
                    int n_components,
                    T* out,
                    unsigned long long seed = 1234567)
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index af0b6feb37..550a3e9505 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -65,13 +65,13 @@ RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
     uint64_t start_idx = row_ind[row];  // each thread processes one row
     uint64_t stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
 
-    int row_nnz       = 0;
+    uint64_t row_nnz       = 0;
     uint64_t out_start_idx = start_idx * 2;
 
-    for (int idx = 0; idx < stop_idx - start_idx; idx++) {
-      int cur_row = rows[idx + start_idx];
-      int cur_col = cols[idx + start_idx];
-      T cur_val   = vals[idx + start_idx];
+    for (uint64_t idx = 0; idx < stop_idx - start_idx; idx++) {
+      int cur_row = rows[start_idx + idx];
+      int cur_col = cols[start_idx + idx];
+      T cur_val   = vals[start_idx + idx];
 
       int lookup_row = cur_col;
       uint64_t t_start    = row_ind[lookup_row];  // Start at
@@ -104,7 +104,7 @@ RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
       // Note that if we did find a match, we don't need to
       // compute `res` on it here because it will be computed
       // in a different thread.
-      if (!found_match && vals[idx] != 0.0) {
+      if (!found_match && cur_val != 0.0) {
         orows[out_start_idx + row_nnz] = cur_col;
         ocols[out_start_idx + row_nnz] = cur_row;
         ovals[out_start_idx + row_nnz] = res;
@@ -158,7 +158,7 @@ void coo_symmetrize(COO<T>* in,
                                                             in->n_rows,
                                                             in->nnz,
                                                             reduction_op);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+    RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 /**
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 4c0595bf91..eb45ab013e 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -28,7 +28,7 @@ void fit_embedding(raft::resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
-                   int nnz,
+                   uint64_t nnz,
                    int n,
                    int n_components,
                    T* out,
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index 700885240c..94c314bb67 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -43,34 +43,36 @@ namespace op {
 namespace detail {
 
 template <uint64_t TPB_X, typename T>
-RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
-                                     const int* cols,
-                                     const T* vals,
+RAFT_KERNEL coo_remove_scalar_kernel(const int* in_rows,
+                                     const int* in_cols,
+                                     const T* in_vals,
                                      uint64_t nnz,
-                                     int* crows,
-                                     int* ccols,
-                                     T* cvals,
-                                     uint64_t* ex_scan,
-                                     uint64_t* cur_ex_scan,
-                                     int m,
-                                     T scalar)
+                                     int* out_rows,
+                                     int* out_cols,
+                                     T* out_vals,
+                                     uint64_t* row_indices,
+				                             int* rows_lenght_acc,
+                                     T scalar,
+				                             int n_rows)
 {
-  int row = (blockIdx.x * TPB_X) + threadIdx.x;
-
-  if (row < m) {
-    uint64_t start       = cur_ex_scan[row];
-    uint64_t stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
-    uint64_t cur_out_idx = ex_scan[row];
-
-    for (uint64_t idx = start; idx < stop; idx++) {
-      if (vals[idx] != scalar) {
-        crows[cur_out_idx] = rows[idx];
-        ccols[cur_out_idx] = cols[idx];
-        cvals[cur_out_idx] = vals[idx];
-        ++cur_out_idx;
-      }
-    }
-  }
+  uint64_t in_idx = (blockIdx.x * TPB_X) + threadIdx.x;
+
+  if (in_idx >= nnz)
+    return;
+
+  int val = in_vals[in_idx];
+
+  if (val == scalar)
+    return;
+
+  int row = in_rows[in_idx];
+
+  uint64_t row_start_index = row_indices[row];
+  uint64_t out_idx = row_start_index + atomicAdd(rows_lenght_acc + row, 1);
+
+  out_rows[out_idx] = row;
+  out_cols[out_idx] = in_cols[in_idx];
+  out_vals[out_idx] = val;
 }
 
 /**
@@ -90,7 +92,7 @@ RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
  * @param d_alloc device allocator for temporary buffers
  * @param stream: cuda stream to use
  */
-template <int TPB_X, typename T>
+template <uint64_t TPB_X, typename T>
 void coo_remove_scalar(const int* rows,
                        const int* cols,
                        const T* vals,
@@ -98,29 +100,23 @@ void coo_remove_scalar(const int* rows,
                        int* crows,
                        int* ccols,
                        T* cvals,
-                       int* cnnz,
-                       int* cur_cnnz,
+                       uint64_t* cnnz,
                        T scalar,
                        int n,
                        cudaStream_t stream)
 {
   rmm::device_uvector<uint64_t> ex_scan(n, stream);
-  rmm::device_uvector<uint64_t> cur_ex_scan(n, stream);
-
-  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (uint64_t)n * sizeof(int), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(uint64_t), stream));
 
-  thrust::device_ptr<int> dev_cnnz    = thrust::device_pointer_cast(cnnz);
+  thrust::device_ptr<uint64_t> dev_cnnz    = thrust::device_pointer_cast(cnnz);
   thrust::device_ptr<uint64_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
   thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  thrust::device_ptr<int> dev_cur_cnnz    = thrust::device_pointer_cast(cur_cnnz);
-  thrust::device_ptr<uint64_t> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
-  thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  rmm::device_uvector<int> rows_length_acc(n, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(rows_length_acc.data(), 0, (uint64_t)n * sizeof(int), stream));
 
-  dim3 grid(raft::ceildiv(n, TPB_X), 1, 1);
+  dim3 grid(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
   coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(rows,
@@ -131,9 +127,9 @@ void coo_remove_scalar(const int* rows,
                                                             ccols,
                                                             cvals,
                                                             dev_ex_scan.get(),
-                                                            dev_cur_ex_scan.get(),
-                                                            n,
-                                                            scalar);
+							                                              rows_length_acc.data(),
+                                                            scalar,
+							                                              n);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
@@ -148,19 +144,13 @@ void coo_remove_scalar(const int* rows,
 template <int TPB_X, typename T>
 void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
 {
-  rmm::device_uvector<int> row_count_nz(in->n_rows, stream);
-  rmm::device_uvector<int> row_count(in->n_rows, stream);
-
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(int), stream));
-
-  linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  rmm::device_uvector<uint64_t> row_count_nz(in->n_rows, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
 
-  linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream);
+  linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, (unsigned long long int*)row_count_nz.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  thrust::device_ptr<int> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
+  thrust::device_ptr<uint64_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
   uint64_t out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows, (uint64_t)0);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
@@ -173,7 +163,6 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
                               out->cols(),
                               out->vals(),
                               row_count_nz.data(),
-                              row_count.data(),
                               scalar,
                               in->n_rows,
                               stream);
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index ddfa01731a..e46ea77eaa 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -151,7 +151,7 @@ int performLanczosIteration(raft::resources const& handle,
 
   RAFT_EXPECTS(A != nullptr, "Null matrix pointer.");
 
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // -------------------------------------------------------
   // Compute second Lanczos vector
@@ -814,7 +814,7 @@ int computeSmallestEigenvectors(
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // Shift for implicit restart
   value_type_t shiftUpper;
@@ -1160,7 +1160,7 @@ int computeLargestEigenvectors(
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // Lanczos iteration counters
   index_type_t maxIter_curr = restartIter;  // Maximum size of Lanczos system

From 347ea4e45582b5854e9b07d4450f8a23c6c4648d Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Wed, 22 Jan 2025 15:27:08 +0000
Subject: [PATCH 03/25] Revert coo_remove_scalar_kernel code

---
 cpp/include/raft/sparse/op/detail/filter.cuh | 78 +++++++++++---------
 1 file changed, 44 insertions(+), 34 deletions(-)

diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index 94c314bb67..e4a18d48bb 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -42,37 +42,35 @@ namespace sparse {
 namespace op {
 namespace detail {
 
-template <uint64_t TPB_X, typename T>
-RAFT_KERNEL coo_remove_scalar_kernel(const int* in_rows,
-                                     const int* in_cols,
-                                     const T* in_vals,
+template <int TPB_X, typename T>
+RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
+                                     const int* cols,
+                                     const T* vals,
                                      uint64_t nnz,
                                      int* out_rows,
                                      int* out_cols,
                                      T* out_vals,
-                                     uint64_t* row_indices,
-				                             int* rows_lenght_acc,
-                                     T scalar,
-				                             int n_rows)
+                                     uint64_t* ex_scan,
+                                     uint64_t* cur_ex_scan,
+                                     int m,
+                                     T scalar)
 {
-  uint64_t in_idx = (blockIdx.x * TPB_X) + threadIdx.x;
-
-  if (in_idx >= nnz)
-    return;
-
-  int val = in_vals[in_idx];
-
-  if (val == scalar)
-    return;
-
-  int row = in_rows[in_idx];
-
-  uint64_t row_start_index = row_indices[row];
-  uint64_t out_idx = row_start_index + atomicAdd(rows_lenght_acc + row, 1);
-
-  out_rows[out_idx] = row;
-  out_cols[out_idx] = in_cols[in_idx];
-  out_vals[out_idx] = val;
+  int row = (blockIdx.x * TPB_X) + threadIdx.x;
+
+  if (row < m) {
+    uint64_t start       = cur_ex_scan[row];
+    uint64_t stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
+    uint64_t cur_out_idx = ex_scan[row];
+
+    for (uint64_t idx = start; idx < stop; idx++) {
+      if (vals[idx] != scalar) {
+        out_rows[cur_out_idx] = rows[idx];
+        out_cols[cur_out_idx] = cols[idx];
+        out_vals[cur_out_idx] = vals[idx];
+        ++cur_out_idx;
+      }
+    }
+  }
 }
 
 /**
@@ -92,7 +90,7 @@ RAFT_KERNEL coo_remove_scalar_kernel(const int* in_rows,
  * @param d_alloc device allocator for temporary buffers
  * @param stream: cuda stream to use
  */
-template <uint64_t TPB_X, typename T>
+template <int TPB_X, typename T>
 void coo_remove_scalar(const int* rows,
                        const int* cols,
                        const T* vals,
@@ -101,22 +99,27 @@ void coo_remove_scalar(const int* rows,
                        int* ccols,
                        T* cvals,
                        uint64_t* cnnz,
+                       uint64_t* cur_cnnz,
                        T scalar,
                        int n,
                        cudaStream_t stream)
 {
   rmm::device_uvector<uint64_t> ex_scan(n, stream);
+  rmm::device_uvector<uint64_t> cur_ex_scan(n, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(uint64_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (uint64_t)n * sizeof(uint64_t), stream));
 
   thrust::device_ptr<uint64_t> dev_cnnz    = thrust::device_pointer_cast(cnnz);
   thrust::device_ptr<uint64_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
   thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  rmm::device_uvector<int> rows_length_acc(n, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(rows_length_acc.data(), 0, (uint64_t)n * sizeof(int), stream));
+  thrust::device_ptr<uint64_t> dev_cur_cnnz    = thrust::device_pointer_cast(cur_cnnz);
+  thrust::device_ptr<uint64_t> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
+  thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  dim3 grid(raft::ceildiv(nnz, TPB_X), 1, 1);
+  dim3 grid(raft::ceildiv(n, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
   coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(rows,
@@ -127,9 +130,9 @@ void coo_remove_scalar(const int* rows,
                                                             ccols,
                                                             cvals,
                                                             dev_ex_scan.get(),
-							                                              rows_length_acc.data(),
-                                                            scalar,
-							                                              n);
+                                                            dev_cur_ex_scan.get(),
+                                                            n,
+                                                            scalar);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
@@ -145,13 +148,19 @@ template <int TPB_X, typename T>
 void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
 {
   rmm::device_uvector<uint64_t> row_count_nz(in->n_rows, stream);
+  rmm::device_uvector<uint64_t> row_count(in->n_rows, stream);
+
   RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
+
+  linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, (unsigned long long int*)row_count_nz.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   thrust::device_ptr<uint64_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
-  uint64_t out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows, (uint64_t)0);
+  uint64_t out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
@@ -163,6 +172,7 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
                               out->cols(),
                               out->vals(),
                               row_count_nz.data(),
+                              row_count.data(),
                               scalar,
                               in->n_rows,
                               stream);

From 92e4af162f4e3bf80500a45a83d17dbbaec1e6ac Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Sat, 25 Jan 2025 10:04:02 +0100
Subject: [PATCH 04/25] check style

---
 cpp/include/raft/sparse/detail/coo.cuh               |  3 +--
 cpp/include/raft/sparse/linalg/detail/norm.cuh       |  4 ++--
 cpp/include/raft/sparse/linalg/detail/symmetrize.cuh |  8 ++++----
 cpp/include/raft/sparse/linalg/norm.cuh              |  2 +-
 cpp/include/raft/sparse/op/detail/filter.cuh         | 12 ++++++++----
 cpp/include/raft/sparse/solver/detail/lanczos.cuh    |  6 ++++--
 6 files changed, 20 insertions(+), 15 deletions(-)

diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh
index 0d2a06bf03..c41af76243 100644
--- a/cpp/include/raft/sparse/detail/coo.cuh
+++ b/cpp/include/raft/sparse/detail/coo.cuh
@@ -229,8 +229,7 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: stream to use for init
    */
-  void allocate(
-    uint64_t nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
+  void allocate(uint64_t nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
   {
     this->n_rows = n_rows;
     this->n_cols = n_cols;
diff --git a/cpp/include/raft/sparse/linalg/detail/norm.cuh b/cpp/include/raft/sparse/linalg/detail/norm.cuh
index 5dc5b2db41..0390fb5f69 100644
--- a/cpp/include/raft/sparse/linalg/detail/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/norm.cuh
@@ -48,7 +48,7 @@ RAFT_KERNEL csr_row_normalize_l1_kernel(
   const indT* ia,  // csr row ex_scan (sorted by row)
   const T* vals,
   indT nnz,  // array of values and number of non-zeros
-  int m,    // num rows in csr
+  int m,     // num rows in csr
   T* result)
 {  // output array
 
@@ -94,7 +94,7 @@ template <int TPB_X = 64, typename T, typename indT>
 void csr_row_normalize_l1(const indT* ia,  // csr row ex_scan (sorted by row)
                           const T* vals,
                           indT nnz,  // array of values and number of non-zeros
-                          int m,    // num rows in csr
+                          int m,     // num rows in csr
                           T* result,
                           cudaStream_t stream)
 {  // output array
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index 550a3e9505..a59fba8f8e 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -73,9 +73,9 @@ RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
       int cur_col = cols[start_idx + idx];
       T cur_val   = vals[start_idx + idx];
 
-      int lookup_row = cur_col;
-      uint64_t t_start    = row_ind[lookup_row];  // Start at
-      uint64_t t_stop     = get_stop_idx(lookup_row, n, cnnz, row_ind);
+      int lookup_row   = cur_col;
+      uint64_t t_start = row_ind[lookup_row];  // Start at
+      uint64_t t_stop  = get_stop_idx(lookup_row, n, cnnz, row_ind);
 
       T transpose = 0.0;
 
@@ -158,7 +158,7 @@ void coo_symmetrize(COO<T>* in,
                                                             in->n_rows,
                                                             in->nnz,
                                                             reduction_op);
-    RAFT_CUDA_TRY(cudaPeekAtLastError());
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
 /**
diff --git a/cpp/include/raft/sparse/linalg/norm.cuh b/cpp/include/raft/sparse/linalg/norm.cuh
index ea325c3472..f90d088ee6 100644
--- a/cpp/include/raft/sparse/linalg/norm.cuh
+++ b/cpp/include/raft/sparse/linalg/norm.cuh
@@ -40,7 +40,7 @@ template <typename T, typename indT>
 void csr_row_normalize_l1(const indT* ia,  // csr row ex_scan (sorted by row)
                           const T* vals,
                           indT nnz,  // array of values and number of non-zeros
-                          int m,    // num rows in csr
+                          int m,     // num rows in csr
                           T* result,
                           cudaStream_t stream)
 {  // output array
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index e4a18d48bb..41af0315d8 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -150,17 +150,21 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
   rmm::device_uvector<uint64_t> row_count_nz(in->n_rows, stream);
   rmm::device_uvector<uint64_t> row_count(in->n_rows, stream);
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
 
   linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, (unsigned long long int*)row_count_nz.data(), stream);
+  linalg::coo_degree_scalar(
+    in->rows(), in->vals(), in->nnz, scalar, (unsigned long long int*)row_count_nz.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   thrust::device_ptr<uint64_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
-  uint64_t out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
+  uint64_t out_nnz =
+    thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index e46ea77eaa..c9749624e1 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -836,7 +836,8 @@ int computeSmallestEigenvectors(
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && static_cast<uint64_t>(nEigVecs) <= n,
+               "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
@@ -1183,7 +1184,8 @@ int computeLargestEigenvectors(
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && static_cast<uint64_t>(nEigVecs) <= n,
+               "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");

From dc72acce3334214f7a3aec361e0fdf6f1bf6a2fe Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Mon, 27 Jan 2025 16:39:05 +0000
Subject: [PATCH 05/25] compilation fix

---
 cpp/include/raft/linalg/detail/lanczos.cuh           | 2 +-
 cpp/include/raft/sparse/linalg/detail/spectral.cuh   | 5 ++---
 cpp/include/raft/spectral/detail/matrix_wrappers.hpp | 6 +++---
 3 files changed, 6 insertions(+), 7 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh
index 134ef3ef36..6498e9c6b7 100644
--- a/cpp/include/raft/linalg/detail/lanczos.cuh
+++ b/cpp/include/raft/linalg/detail/lanczos.cuh
@@ -1004,7 +1004,7 @@ int computeSmallestEigenvectors(
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && nEigVecs <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && static_cast<uint64_t>(nEigVecs) <= n, "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index ad2a567294..334906c114 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -45,8 +45,7 @@ void fit_embedding(raft::resources const& handle,
   rmm::device_uvector<int> src_offsets(n + 1, stream);
   rmm::device_uvector<int> dst_cols(nnz, stream);
   rmm::device_uvector<T> dst_vals(nnz, stream);
-  convert::coo_to_csr(
-    handle, rows, cols, vals, nnz, n, src_offsets.data(), dst_cols.data(), dst_vals.data());
+  convert::coo_to_csr(handle, rows, cols, vals, static_cast<int>(nnz), static_cast<int>(n), src_offsets.data(), dst_cols.data(), dst_vals.data());
 
   rmm::device_uvector<T> eigVals(n_components + 1, stream);
   rmm::device_uvector<T> eigVecs(n * (n_components + 1), stream);
@@ -65,7 +64,7 @@ void fit_embedding(raft::resources const& handle,
   value_type* vs = dst_vals.data();
 
   raft::spectral::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
-    handle, ro, ci, vs, n, nnz};
+    handle, ro, ci, vs, static_cast<int>(n), nnz};
 
   index_type neigvs       = n_components + 1;
   index_type maxiter      = 4000;  // default reset value (when set to 0);
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index db8a5dc9ef..91094b4be8 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -142,7 +142,7 @@ struct sparse_matrix_t {
                   value_type const* values,
                   index_type const nrows,
                   index_type const ncols,
-                  index_type const nnz)
+                  uint64_t const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
       col_indices_(col_indices),
@@ -158,7 +158,7 @@ struct sparse_matrix_t {
                   index_type const* col_indices,
                   value_type const* values,
                   index_type const nrows,
-                  index_type const nnz)
+                  uint64_t const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
       col_indices_(col_indices),
@@ -311,7 +311,7 @@ struct sparse_matrix_t {
   value_type const* values_;
   index_type const nrows_;
   index_type const ncols_;
-  index_type const nnz_;
+  uint64_t const nnz_;
 };
 
 template <typename index_type, typename value_type>

From 8e58d293a4a6ff14efbad9cb4dcf496336c996b9 Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Mon, 27 Jan 2025 18:54:45 +0000
Subject: [PATCH 06/25] fix tests

---
 cpp/tests/linalg/eigen_solvers.cu   | 2 +-
 cpp/tests/sparse/solver/lanczos.cu  | 4 ++--
 cpp/tests/sparse/spectral_matrix.cu | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/cpp/tests/linalg/eigen_solvers.cu b/cpp/tests/linalg/eigen_solvers.cu
index cf75ff89bf..e18b3a9f29 100644
--- a/cpp/tests/linalg/eigen_solvers.cu
+++ b/cpp/tests/linalg/eigen_solvers.cu
@@ -46,7 +46,7 @@ TEST(Raft, EigenSolvers)
   index_type nnz   = 0;
   index_type nrows = 0;
 
-  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
+  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, static_cast<uint64_t>(nnz)};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
 
   index_type neigvs{10};
diff --git a/cpp/tests/sparse/solver/lanczos.cu b/cpp/tests/sparse/solver/lanczos.cu
index 23b3a7ff99..2f5de6870d 100644
--- a/cpp/tests/sparse/solver/lanczos.cu
+++ b/cpp/tests/sparse/solver/lanczos.cu
@@ -173,7 +173,7 @@ class rmat_lanczos_tests
       raft::make_device_vector<IndexType, uint32_t, raft::row_major>(handle,
                                                                      symmetric_coo.n_rows + 1);
     raft::sparse::convert::sorted_coo_to_csr(symmetric_coo.rows(),
-                                             (IndexType)symmetric_coo.nnz,
+                                             static_cast<int>(symmetric_coo.nnz),
                                              row_indices.data_handle(),
                                              symmetric_coo.n_rows + 1,
                                              stream);
@@ -198,7 +198,7 @@ class rmat_lanczos_tests
       symmetric_coo.cols(),
       symmetric_coo.vals(),
       symmetric_coo.n_rows,
-      (IndexType)symmetric_coo.nnz};
+      symmetric_coo.nnz};
     raft::sparse::solver::lanczos_solver_config<ValueType> config{
       n_components, params.maxiter, params.restartiter, params.tol, rng.seed};
 
diff --git a/cpp/tests/sparse/spectral_matrix.cu b/cpp/tests/sparse/spectral_matrix.cu
index 52f7eff10e..f37af32fb9 100644
--- a/cpp/tests/sparse/spectral_matrix.cu
+++ b/cpp/tests/sparse/spectral_matrix.cu
@@ -55,7 +55,7 @@ TEST(Raft, SpectralMatrices)
   value_type* vs{nullptr};
   index_type nnz   = 0;
   index_type nrows = 0;
-  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
+  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, static_cast<uint64_t>(nnz)};
   sparse_matrix_t<index_type, value_type> sm2{h, csr_v};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
   ASSERT_EQ(nullptr, sm2.row_offsets_);

From c75cd87992f45ec36bd5d596646159c2e2b5cc17 Mon Sep 17 00:00:00 2001
From: Dante Gama Dessavre <danteg@nvidia.com>
Date: Tue, 28 Jan 2025 13:06:36 -0600
Subject: [PATCH 07/25] FIX style fixes

---
 cpp/include/raft/linalg/detail/lanczos.cuh         |  3 ++-
 cpp/include/raft/sparse/linalg/detail/spectral.cuh | 10 +++++++++-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh
index 6498e9c6b7..d4d3325f77 100644
--- a/cpp/include/raft/linalg/detail/lanczos.cuh
+++ b/cpp/include/raft/linalg/detail/lanczos.cuh
@@ -1004,7 +1004,8 @@ int computeSmallestEigenvectors(
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && static_cast<uint64_t>(nEigVecs) <= n, "Invalid number of eigenvectors.");
+  RAFT_EXPECTS(nEigVecs > 0 && static_cast<uint64_t>(nEigVecs) <= n,
+               "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
   RAFT_EXPECTS(maxIter >= nEigVecs, "Invalid maxIter.");
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index 334906c114..90e698f0b9 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -45,7 +45,15 @@ void fit_embedding(raft::resources const& handle,
   rmm::device_uvector<int> src_offsets(n + 1, stream);
   rmm::device_uvector<int> dst_cols(nnz, stream);
   rmm::device_uvector<T> dst_vals(nnz, stream);
-  convert::coo_to_csr(handle, rows, cols, vals, static_cast<int>(nnz), static_cast<int>(n), src_offsets.data(), dst_cols.data(), dst_vals.data());
+  convert::coo_to_csr(handle,
+                      rows,
+                      cols,
+                      vals,
+                      static_cast<int>(nnz),
+                      static_cast<int>(n),
+                      src_offsets.data(),
+                      dst_cols.data(),
+                      dst_vals.data());
 
   rmm::device_uvector<T> eigVals(n_components + 1, stream);
   rmm::device_uvector<T> eigVecs(n * (n_components + 1), stream);

From 694d371661c4ac97955ad2e6324a3ae9734f2aa9 Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Wed, 29 Jan 2025 19:53:22 +0000
Subject: [PATCH 08/25] changes so far

---
 .../raft/cluster/detail/connectivities.cuh    |  2 +-
 cpp/include/raft/cluster/detail/mst.cuh       |  2 +-
 cpp/include/raft/sparse/convert/csr.cuh       |  8 +--
 .../raft/sparse/convert/detail/csr.cuh        |  2 +-
 cpp/include/raft/sparse/detail/coo.cuh        | 67 ++++++++++++-------
 cpp/include/raft/sparse/linalg/degree.cuh     | 16 ++---
 .../neighbors/detail/cross_component_nn.cuh   |  2 +-
 .../raft/spatial/knn/detail/ball_cover.cuh    |  2 +-
 .../raft/spectral/detail/matrix_wrappers.hpp  |  8 +--
 cpp/tests/linalg/eigen_solvers.cu             |  2 +-
 cpp/tests/sparse/solver/lanczos.cu            |  2 +-
 cpp/tests/sparse/spectral_matrix.cu           |  2 +-
 cpp/tests/sparse/symmetrize.cu                |  2 +-
 13 files changed, 69 insertions(+), 48 deletions(-)

diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh
index fdb8af9171..c527b754c3 100644
--- a/cpp/include/raft/cluster/detail/connectivities.cuh
+++ b/cpp/include/raft/cluster/detail/connectivities.cuh
@@ -95,7 +95,7 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx,
                       });
 
     raft::sparse::convert::sorted_coo_to_csr(
-      knn_graph_coo.rows(), (value_idx)knn_graph_coo.nnz, indptr.data(), m + 1, stream);
+      knn_graph_coo.rows(), knn_graph_coo.nnz, indptr.data(), m + 1, stream);
 
     // TODO: Wouldn't need to copy here if we could compute knn
     // graph directly on the device uvectors
diff --git a/cpp/include/raft/cluster/detail/mst.cuh b/cpp/include/raft/cluster/detail/mst.cuh
index fff2a535ec..2b77ca9963 100644
--- a/cpp/include/raft/cluster/detail/mst.cuh
+++ b/cpp/include/raft/cluster/detail/mst.cuh
@@ -98,7 +98,7 @@ void connect_knn_graph(
 
   rmm::device_uvector<value_idx> indptr2(m + 1, stream);
   raft::sparse::convert::sorted_coo_to_csr(
-    connected_edges.rows(), (value_idx)connected_edges.nnz, indptr2.data(), m + 1, stream);
+    connected_edges.rows(), connected_edges.nnz, indptr2.data(), m + 1, stream);
 
   // On the second call, we hand the MST the original colors
   // and the new set of edges and let it restart the optimization process
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 188928bbaa..1478f1c42e 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -54,10 +54,10 @@ void coo_to_csr(raft::resources const& handle,
  * @param m: number of rows in dense matrix
  * @param stream: cuda stream to use
  */
-template <typename T, typename outT>
-void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream_t stream)
+template <typename T, typename nnz_type, typename outT>
+void sorted_coo_to_csr(const T* rows, nnz_type nnz, outT* row_ind, int m, cudaStream_t stream)
 {
-  detail::sorted_coo_to_csr(rows, nnz, row_ind, m, stream);
+  detail::sorted_coo_to_csr(rows, (uint64_t)nnz, row_ind, m, stream);
 }
 
 /**
@@ -70,7 +70,7 @@ void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream
 template <typename T, typename outT>
 void sorted_coo_to_csr(COO<T>* coo, outT* row_ind, cudaStream_t stream)
 {
-  detail::sorted_coo_to_csr(coo->rows(), (outT)coo->nnz, row_ind, coo->n_rows, stream);
+  detail::sorted_coo_to_csr(coo->rows(), coo->safe_nnz, row_ind, coo->n_rows, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh
index aac5b26786..44d6f21f9d 100644
--- a/cpp/include/raft/sparse/convert/detail/csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/csr.cuh
@@ -85,7 +85,7 @@ void coo_to_csr(raft::resources const& handle,
  * @param stream: cuda stream to use
  */
 template <typename T, typename outT>
-void sorted_coo_to_csr(const T* rows, outT nnz, outT* row_ind, int m, cudaStream_t stream)
+void sorted_coo_to_csr(const T* rows, uint64_t nnz, outT* row_ind, int m, cudaStream_t stream)
 {
   rmm::device_uvector<outT> row_counts(m, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, (uint64_t)m * sizeof(outT), stream));
diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh
index c41af76243..38cd33cb3e 100644
--- a/cpp/include/raft/sparse/detail/coo.cuh
+++ b/cpp/include/raft/sparse/detail/coo.cuh
@@ -52,7 +52,8 @@ class COO {
   rmm::device_uvector<T> vals_arr;
 
  public:
-  uint64_t nnz;
+  Index_Type nnz;
+  uint64_t safe_nnz;
   Index_Type n_rows;
   Index_Type n_cols;
 
@@ -60,7 +61,13 @@ class COO {
    * @param stream: CUDA stream to use
    */
   COO(cudaStream_t stream)
-    : rows_arr(0, stream), cols_arr(0, stream), vals_arr(0, stream), nnz(0), n_rows(0), n_cols(0)
+    : rows_arr(0, stream),
+      cols_arr(0, stream),
+      vals_arr(0, stream),
+      nnz(0),
+      n_rows(0),
+      n_cols(0),
+      safe_nnz(0)
   {
   }
 
@@ -72,13 +79,20 @@ class COO {
    * @param n_rows: number of rows in the dense matrix
    * @param n_cols: number of cols in the dense matrix
    */
+  template <typename SafeNNZ_Type>
   COO(rmm::device_uvector<Index_Type>& rows,
       rmm::device_uvector<Index_Type>& cols,
       rmm::device_uvector<T>& vals,
-      uint64_t nnz,
+      SafeNNZ_Type nnz,
       Index_Type n_rows = 0,
       Index_Type n_cols = 0)
-    : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols)
+    : rows_arr(rows),
+      cols_arr(cols),
+      vals_arr(vals),
+      nnz((Index_Type)nnz),
+      n_rows(n_rows),
+      n_cols(n_cols),
+      safe_nnz((uint64_t)nnz)
   {
   }
 
@@ -89,8 +103,9 @@ class COO {
    * @param n_cols: number of cols in the dense matrix
    * @param init: initialize arrays with zeros
    */
+  template <typename SafeNNZ_Type>
   COO(cudaStream_t stream,
-      uint64_t nnz,
+      SafeNNZ_Type nnz,
       Index_Type n_rows = 0,
       Index_Type n_cols = 0,
       bool init         = true)
@@ -99,7 +114,8 @@ class COO {
       vals_arr(nnz, stream),
       nnz(nnz),
       n_rows(n_rows),
-      n_cols(n_cols)
+      n_cols(n_cols),
+      safe_nnz(nnz)
   {
     if (init) init_arrays(stream);
   }
@@ -107,10 +123,10 @@ class COO {
   void init_arrays(cudaStream_t stream)
   {
     RAFT_CUDA_TRY(
-      cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
+      cudaMemsetAsync(this->rows_arr.data(), 0, this->safe_nnz * sizeof(Index_Type), stream));
     RAFT_CUDA_TRY(
-      cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
+      cudaMemsetAsync(this->cols_arr.data(), 0, this->safe_nnz * sizeof(Index_Type), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(this->vals_arr.data(), 0, this->safe_nnz * sizeof(T), stream));
   }
 
   ~COO() {}
@@ -121,7 +137,7 @@ class COO {
    */
   bool validate_size() const
   {
-    if (this->nnz <= 0 || n_rows <= 0 || n_cols <= 0) return false;
+    if (this->safe_nnz <= 0 || n_rows <= 0 || n_cols <= 0) return false;
     return true;
   }
 
@@ -162,10 +178,10 @@ class COO {
       cudaStream_t stream;
       RAFT_CUDA_TRY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl;
-      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl;
-      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl;
-      out << "nnz=" << c.nnz << std::endl;
+      out << raft::arr2Str(c.rows_arr.data(), c.safe_nnz, "rows", stream) << std::endl;
+      out << raft::arr2Str(c.cols_arr.data(), c.safe_nnz, "cols", stream) << std::endl;
+      out << raft::arr2Str(c.vals_arr.data(), c.safe_nnz, "vals", stream) << std::endl;
+      out << "nnz=" << c.safe_nnz << std::endl;
       out << "n_rows=" << c.n_rows << std::endl;
       out << "n_cols=" << c.n_cols << std::endl;
 
@@ -204,7 +220,8 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(uint64_t nnz, bool init, cudaStream_t stream)
+  template <typename SafeNNZ_Type>
+  void allocate(SafeNNZ_Type nnz, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, 0, init, stream);
   }
@@ -216,7 +233,8 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  void allocate(uint64_t nnz, Index_Type size, bool init, cudaStream_t stream)
+  template <typename SafeNNZ_Type>
+  void allocate(SafeNNZ_Type nnz, Index_Type size, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, size, size, init, stream);
   }
@@ -229,15 +247,18 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: stream to use for init
    */
-  void allocate(uint64_t nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
+  template <typename SafeNNZ_Type>
+  void allocate(
+    SafeNNZ_Type nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
   {
-    this->n_rows = n_rows;
-    this->n_cols = n_cols;
-    this->nnz    = nnz;
+    this->n_rows   = n_rows;
+    this->n_cols   = n_cols;
+    this->nnz      = (Index_Type)nnz;
+    this->safe_nnz = nnz;
 
-    this->rows_arr.resize(this->nnz, stream);
-    this->cols_arr.resize(this->nnz, stream);
-    this->vals_arr.resize(this->nnz, stream);
+    this->rows_arr.resize(nnz, stream);
+    this->cols_arr.resize(nnz, stream);
+    this->vals_arr.resize(nnz, stream);
 
     if (init) init_arrays(stream);
   }
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index a154c2357c..9e17b01c75 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -33,10 +33,10 @@ namespace linalg {
  * @param results: output result array
  * @param stream: cuda stream to use
  */
-template <typename T = int, typename outT>
-void coo_degree(const T* rows, uint64_t nnz, outT* results, cudaStream_t stream)
+template <typename T = int, typename nnz_type, typename outT>
+void coo_degree(const T* rows, nnz_type nnz, outT* results, cudaStream_t stream)
 {
-  detail::coo_degree<64, T>(rows, nnz, results, stream);
+  detail::coo_degree<64, T>(rows, (uint64_t)nnz, results, stream);
 }
 
 /**
@@ -50,7 +50,7 @@ void coo_degree(const T* rows, uint64_t nnz, outT* results, cudaStream_t stream)
 template <typename T, typename outT>
 void coo_degree(COO<T>* in, outT* results, cudaStream_t stream)
 {
-  coo_degree(in->rows(), in->nnz, results, stream);
+  coo_degree(in->rows(), in->safe_nnz, results, stream);
 }
 
 /**
@@ -64,11 +64,11 @@ void coo_degree(COO<T>* in, outT* results, cudaStream_t stream)
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <typename T, typename outT>
+template <typename T, typename nnz_type, typename outT>
 void coo_degree_scalar(
-  const int* rows, const T* vals, uint64_t nnz, T scalar, outT* results, cudaStream_t stream = 0)
+  const int* rows, const T* vals, nnz_type nnz, T scalar, outT* results, cudaStream_t stream = 0)
 {
-  detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream);
+  detail::coo_degree_scalar<64>(rows, vals, (uint64_t)nnz, scalar, results, stream);
 }
 
 /**
@@ -83,7 +83,7 @@ void coo_degree_scalar(
 template <typename T, typename outT>
 void coo_degree_scalar(COO<T>* in, T scalar, outT* results, cudaStream_t stream)
 {
-  coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, results, stream);
+  coo_degree_scalar(in->rows(), in->vals(), in->safe_nnz, scalar, results, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh b/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
index 1247b91d2e..a47d5a6f34 100644
--- a/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
@@ -242,7 +242,7 @@ void perform_1nn(raft::resources const& handle,
   // the color components.
   auto colors_group_idxs = raft::make_device_vector<value_idx, value_idx>(handle, n_components + 1);
   raft::sparse::convert::sorted_coo_to_csr(
-    colors, (value_idx)n_rows, colors_group_idxs.data_handle(), n_components + 1, stream);
+    colors, n_rows, colors_group_idxs.data_handle(), n_components + 1, stream);
 
   auto group_idxs_view = raft::make_device_vector_view<const value_idx, value_idx>(
     colors_group_idxs.data_handle() + 1, n_components);
diff --git a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
index f436d5c740..c4ca2ffa61 100644
--- a/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
+++ b/cpp/include/raft/spatial/knn/detail/ball_cover.cuh
@@ -161,7 +161,7 @@ void construct_landmark_1nn(raft::resources const& handle,
 
   // convert to CSR for fast lookup
   raft::sparse::convert::sorted_coo_to_csr(R_1nn_inds.data(),
-                                           (value_idx)index.m,
+                                           index.m,
                                            index.get_R_indptr().data_handle(),
                                            index.n_landmarks + 1,
                                            resource::get_cuda_stream(handle));
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index 91094b4be8..bab5b4515a 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -134,7 +134,7 @@ class vector_t {
   const thrust_exec_policy_t thrust_policy;
 };
 
-template <typename index_type, typename value_type>
+template <typename index_type, typename value_type, typename nnz_type = index_type>
 struct sparse_matrix_t {
   sparse_matrix_t(resources const& raft_handle,
                   index_type const* row_offsets,
@@ -142,7 +142,7 @@ struct sparse_matrix_t {
                   value_type const* values,
                   index_type const nrows,
                   index_type const ncols,
-                  uint64_t const nnz)
+                  nnz_type const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
       col_indices_(col_indices),
@@ -158,7 +158,7 @@ struct sparse_matrix_t {
                   index_type const* col_indices,
                   value_type const* values,
                   index_type const nrows,
-                  uint64_t const nnz)
+                  nnz_type const nnz)
     : handle_(raft_handle),
       row_offsets_(row_offsets),
       col_indices_(col_indices),
@@ -311,7 +311,7 @@ struct sparse_matrix_t {
   value_type const* values_;
   index_type const nrows_;
   index_type const ncols_;
-  uint64_t const nnz_;
+  nnz_type const nnz_;
 };
 
 template <typename index_type, typename value_type>
diff --git a/cpp/tests/linalg/eigen_solvers.cu b/cpp/tests/linalg/eigen_solvers.cu
index e18b3a9f29..cf75ff89bf 100644
--- a/cpp/tests/linalg/eigen_solvers.cu
+++ b/cpp/tests/linalg/eigen_solvers.cu
@@ -46,7 +46,7 @@ TEST(Raft, EigenSolvers)
   index_type nnz   = 0;
   index_type nrows = 0;
 
-  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, static_cast<uint64_t>(nnz)};
+  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
 
   index_type neigvs{10};
diff --git a/cpp/tests/sparse/solver/lanczos.cu b/cpp/tests/sparse/solver/lanczos.cu
index 2f5de6870d..128ab73747 100644
--- a/cpp/tests/sparse/solver/lanczos.cu
+++ b/cpp/tests/sparse/solver/lanczos.cu
@@ -173,7 +173,7 @@ class rmat_lanczos_tests
       raft::make_device_vector<IndexType, uint32_t, raft::row_major>(handle,
                                                                      symmetric_coo.n_rows + 1);
     raft::sparse::convert::sorted_coo_to_csr(symmetric_coo.rows(),
-                                             static_cast<int>(symmetric_coo.nnz),
+                                             symmetric_coo.nnz,
                                              row_indices.data_handle(),
                                              symmetric_coo.n_rows + 1,
                                              stream);
diff --git a/cpp/tests/sparse/spectral_matrix.cu b/cpp/tests/sparse/spectral_matrix.cu
index f37af32fb9..52f7eff10e 100644
--- a/cpp/tests/sparse/spectral_matrix.cu
+++ b/cpp/tests/sparse/spectral_matrix.cu
@@ -55,7 +55,7 @@ TEST(Raft, SpectralMatrices)
   value_type* vs{nullptr};
   index_type nnz   = 0;
   index_type nrows = 0;
-  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, static_cast<uint64_t>(nnz)};
+  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
   sparse_matrix_t<index_type, value_type> sm2{h, csr_v};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
   ASSERT_EQ(nullptr, sm2.row_offsets_);
diff --git a/cpp/tests/sparse/symmetrize.cu b/cpp/tests/sparse/symmetrize.cu
index c3a03a942c..5f7463ac89 100644
--- a/cpp/tests/sparse/symmetrize.cu
+++ b/cpp/tests/sparse/symmetrize.cu
@@ -109,7 +109,7 @@ class SparseSymmetrizeTest
     rmm::device_scalar<value_idx> sum(stream);
     sum.set_value_to_zero_async(stream);
 
-    assert_symmetry<<<raft::ceildiv(out.nnz, (uint64_t)256), 256, 0, stream>>>(
+    assert_symmetry<<<raft::ceildiv(out.nnz, 256), 256, 0, stream>>>(
       out.rows(), out.cols(), out.vals(), (value_idx)out.nnz, sum.data());
 
     sum_h = sum.value(stream);

From 48813134d38051a9ec1a11ecbbba02b4461a44e4 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 29 Jan 2025 15:31:55 -0500
Subject: [PATCH 09/25] Exposing more templates for sparse types

---
 cpp/include/raft/sparse/convert/csr.cuh       |  2 +-
 cpp/include/raft/sparse/linalg/degree.cuh     |  2 +-
 .../raft/sparse/linalg/detail/degree.cuh      | 24 ++++-----
 .../raft/sparse/linalg/detail/spectral.cuh    |  6 +--
 .../raft/sparse/linalg/detail/symmetrize.cuh  | 22 ++++----
 cpp/include/raft/sparse/linalg/spectral.cuh   |  6 +--
 cpp/include/raft/sparse/op/detail/filter.cuh  | 52 ++++++++++---------
 cpp/include/raft/sparse/op/detail/sort.h      | 11 ++--
 cpp/include/raft/sparse/op/sort.cuh           | 11 ++--
 9 files changed, 71 insertions(+), 65 deletions(-)

diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 1478f1c42e..53a025ec71 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -57,7 +57,7 @@ void coo_to_csr(raft::resources const& handle,
 template <typename T, typename nnz_type, typename outT>
 void sorted_coo_to_csr(const T* rows, nnz_type nnz, outT* row_ind, int m, cudaStream_t stream)
 {
-  detail::sorted_coo_to_csr(rows, (uint64_t)nnz, row_ind, m, stream);
+  detail::sorted_coo_to_csr(rows, nnz, row_ind, m, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 9e17b01c75..965c923993 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -36,7 +36,7 @@ namespace linalg {
 template <typename T = int, typename nnz_type, typename outT>
 void coo_degree(const T* rows, nnz_type nnz, outT* results, cudaStream_t stream)
 {
-  detail::coo_degree<64, T>(rows, (uint64_t)nnz, results, stream);
+  detail::coo_degree<64, T>(rows, (nnz_type)nnz, results, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/linalg/detail/degree.cuh b/cpp/include/raft/sparse/linalg/detail/degree.cuh
index d7f72e9b6c..0163e15878 100644
--- a/cpp/include/raft/sparse/linalg/detail/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/degree.cuh
@@ -39,8 +39,8 @@ namespace detail {
  * @param nnz the size of the rows array
  * @param results array to place results
  */
-template <uint64_t TPB_X = 64, typename T = int, typename outT>
-RAFT_KERNEL coo_degree_kernel(const T* rows, uint64_t nnz, outT* results)
+template <uint64_t TPB_X = 64, typename T = int, typename outT, typename nnz_t>
+RAFT_KERNEL coo_degree_kernel(const T* rows, nnz_t nnz, outT* results)
 {
   uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < nnz) { atomicAdd(results + rows[row], (outT)1); }
@@ -54,8 +54,8 @@ RAFT_KERNEL coo_degree_kernel(const T* rows, uint64_t nnz, outT* results)
  * @param results: output result array
  * @param stream: cuda stream to use
  */
-template <uint64_t TPB_X = 64, typename T = int, typename outT>
-void coo_degree(const T* rows, uint64_t nnz, outT* results, cudaStream_t stream)
+template <uint64_t TPB_X = 64, typename T = int, typename outT, typename nnz_t>
+void coo_degree(const T* rows, nnz_t nnz, outT* results, cudaStream_t stream)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
@@ -64,16 +64,16 @@ void coo_degree(const T* rows, uint64_t nnz, outT* results, cudaStream_t stream)
   RAFT_CUDA_TRY(cudaGetLastError());
 }
 
-template <int TPB_X = 64, typename T>
-RAFT_KERNEL coo_degree_nz_kernel(const int* rows, const T* vals, int nnz, int* results)
+template <int TPB_X = 64, typename T, typename nnz_t>
+RAFT_KERNEL coo_degree_nz_kernel(const int* rows, const T* vals, nnz_t nnz, int* results)
 {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
-template <uint64_t TPB_X = 64, typename T, typename outT>
+template <uint64_t TPB_X = 64, typename T, typename outT, typename nnz_t>
 RAFT_KERNEL coo_degree_scalar_kernel(
-  const int* rows, const T* vals, uint64_t nnz, T scalar, outT* results)
+  const int* rows, const T* vals, nnz_t nnz, T scalar, outT* results)
 {
   uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
   if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd((outT*)results + rows[row], (outT)1); }
@@ -90,9 +90,9 @@ RAFT_KERNEL coo_degree_scalar_kernel(
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <uint64_t TPB_X = 64, typename T, typename outT>
+template <uint64_t TPB_X = 64, typename T, typename outT, typename nnz_t>
 void coo_degree_scalar(
-  const int* rows, const T* vals, uint64_t nnz, T scalar, outT* results, cudaStream_t stream = 0)
+  const int* rows, const T* vals, nnz_t nnz, T scalar, outT* results, cudaStream_t stream = 0)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
@@ -110,8 +110,8 @@ void coo_degree_scalar(
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <int TPB_X = 64, typename T>
-void coo_degree_nz(const int* rows, const T* vals, int nnz, int* results, cudaStream_t stream)
+template <int TPB_X = 64, typename T, typename nnz_t>
+void coo_degree_nz(const int* rows, const T* vals, nnz_t nnz, int* results, cudaStream_t stream)
 {
   dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index 90e698f0b9..103030cbad 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -30,13 +30,13 @@ namespace sparse {
 namespace spectral {
 namespace detail {
 
-template <typename T>
+template <typename T, typename IndT, typename nnz_t>
 void fit_embedding(raft::resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
-                   uint64_t nnz,
-                   uint64_t n,
+                   nnz_t nnz,
+                   IndT n,
                    int n_components,
                    T* out,
                    unsigned long long seed = 1234567)
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index a59fba8f8e..513b71d0b2 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -47,8 +47,8 @@ namespace detail {
 
 // TODO: value_idx param needs to be used for this once FAISS is updated to use float32
 // for indices so that the index types can be uniform
-template <int TPB_X = 128, typename T, typename Lambda>
-RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
+template <int TPB_X = 128, typename T, typename Lambda, typename nnz_t>
+RAFT_KERNEL coo_symmetrize_kernel(nnz_t* row_ind,
                                   int* rows,
                                   int* cols,
                                   T* vals,
@@ -56,31 +56,31 @@ RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
                                   int* ocols,
                                   T* ovals,
                                   int n,
-                                  uint64_t cnnz,
+                                  nnz_t cnnz,
                                   Lambda reduction_op)
 {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < n) {
-    uint64_t start_idx = row_ind[row];  // each thread processes one row
-    uint64_t stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
+    nnz_t start_idx = row_ind[row];  // each thread processes one row
+    nnz_t stop_idx  = get_stop_idx(row, n, cnnz, row_ind);
 
-    uint64_t row_nnz       = 0;
-    uint64_t out_start_idx = start_idx * 2;
+    nnz_t row_nnz       = 0;
+    nnz_t out_start_idx = start_idx * 2;
 
     for (uint64_t idx = 0; idx < stop_idx - start_idx; idx++) {
       int cur_row = rows[start_idx + idx];
       int cur_col = cols[start_idx + idx];
       T cur_val   = vals[start_idx + idx];
 
-      int lookup_row   = cur_col;
-      uint64_t t_start = row_ind[lookup_row];  // Start at
-      uint64_t t_stop  = get_stop_idx(lookup_row, n, cnnz, row_ind);
+      int lookup_row = cur_col;
+      nnz_t t_start  = row_ind[lookup_row];  // Start at
+      nnz_t t_stop   = get_stop_idx(lookup_row, n, cnnz, row_ind);
 
       T transpose = 0.0;
 
       bool found_match = false;
-      for (uint64_t t_idx = t_start; t_idx < t_stop; t_idx++) {
+      for (nnz_t t_idx = t_start; t_idx < t_stop; t_idx++) {
         // If we find a match, let's get out of the loop. We won't
         // need to modify the transposed value, since that will be
         // done in a different thread.
diff --git a/cpp/include/raft/sparse/linalg/spectral.cuh b/cpp/include/raft/sparse/linalg/spectral.cuh
index 33ba9cd340..c63d551bf2 100644
--- a/cpp/include/raft/sparse/linalg/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/spectral.cuh
@@ -23,13 +23,13 @@ namespace raft {
 namespace sparse {
 namespace spectral {
 
-template <typename T>
+template <typename T, typename IndT, typename nnz_t>
 void fit_embedding(raft::resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
-                   uint64_t nnz,
-                   int n,
+                   nnz_t nnz,
+                   IndT n,
                    int n_components,
                    T* out,
                    unsigned long long seed = 1234567)
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index 41af0315d8..881db377f9 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -42,27 +42,27 @@ namespace sparse {
 namespace op {
 namespace detail {
 
-template <int TPB_X, typename T>
+template <int TPB_X, typename T, typename nnz_t>
 RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
                                      const int* cols,
                                      const T* vals,
-                                     uint64_t nnz,
+                                     nnz_t nnz,
                                      int* out_rows,
                                      int* out_cols,
                                      T* out_vals,
-                                     uint64_t* ex_scan,
-                                     uint64_t* cur_ex_scan,
+                                     nnz_t* ex_scan,
+                                     nnz_t* cur_ex_scan,
                                      int m,
                                      T scalar)
 {
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
 
   if (row < m) {
-    uint64_t start       = cur_ex_scan[row];
-    uint64_t stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
-    uint64_t cur_out_idx = ex_scan[row];
+    nnz_t start       = cur_ex_scan[row];
+    nnz_t stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
+    nnz_t cur_out_idx = ex_scan[row];
 
-    for (uint64_t idx = start; idx < stop; idx++) {
+    for (nnz_t idx = start; idx < stop; idx++) {
       if (vals[idx] != scalar) {
         out_rows[cur_out_idx] = rows[idx];
         out_cols[cur_out_idx] = cols[idx];
@@ -90,32 +90,32 @@ RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
  * @param d_alloc device allocator for temporary buffers
  * @param stream: cuda stream to use
  */
-template <int TPB_X, typename T>
+template <int TPB_X, typename T, typename nnz_t>
 void coo_remove_scalar(const int* rows,
                        const int* cols,
                        const T* vals,
-                       uint64_t nnz,
+                       nnz_t nnz,
                        int* crows,
                        int* ccols,
                        T* cvals,
-                       uint64_t* cnnz,
-                       uint64_t* cur_cnnz,
+                       nnz_t* cnnz,
+                       nnz_t* cur_cnnz,
                        T scalar,
                        int n,
                        cudaStream_t stream)
 {
   rmm::device_uvector<uint64_t> ex_scan(n, stream);
   rmm::device_uvector<uint64_t> cur_ex_scan(n, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(uint64_t), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (uint64_t)n * sizeof(uint64_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (nnz_t)n * sizeof(uint64_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (nnz_t)n * sizeof(uint64_t), stream));
 
-  thrust::device_ptr<uint64_t> dev_cnnz    = thrust::device_pointer_cast(cnnz);
-  thrust::device_ptr<uint64_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
+  thrust::device_ptr<nnz_t> dev_cnnz    = thrust::device_pointer_cast(cnnz);
+  thrust::device_ptr<nnz_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
   thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  thrust::device_ptr<uint64_t> dev_cur_cnnz    = thrust::device_pointer_cast(cur_cnnz);
-  thrust::device_ptr<uint64_t> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
+  thrust::device_ptr<nnz_t> dev_cur_cnnz    = thrust::device_pointer_cast(cur_cnnz);
+  thrust::device_ptr<nnz_t> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
   thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
@@ -147,22 +147,26 @@ void coo_remove_scalar(const int* rows,
 template <int TPB_X, typename T>
 void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
 {
-  rmm::device_uvector<uint64_t> row_count_nz(in->n_rows, stream);
-  rmm::device_uvector<uint64_t> row_count(in->n_rows, stream);
+  rmm::device_uvector<nnz_t> row_count_nz(in->n_rows, stream);
+  rmm::device_uvector<nnz_t> row_count(in->n_rows, stream);
 
   RAFT_CUDA_TRY(
     cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
   RAFT_CUDA_TRY(
     cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
 
-  linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
+  linalg::coo_degree(in->rows(), in->safe_nnz, row_count.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  linalg::coo_degree_scalar(
-    in->rows(), in->vals(), in->nnz, scalar, (unsigned long long int*)row_count_nz.data(), stream);
+  linalg::coo_degree_scalar(in->rows(),
+                            in->vals(),
+                            in->safe_nnz,
+                            scalar,
+                            (unsigned long long int*)row_count_nz.data(),
+                            stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  thrust::device_ptr<uint64_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
+  thrust::device_ptr<nnz_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
   uint64_t out_nnz =
     thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
 
diff --git a/cpp/include/raft/sparse/op/detail/sort.h b/cpp/include/raft/sparse/op/detail/sort.h
index 7d09ebeddc..6464ad9374 100644
--- a/cpp/include/raft/sparse/op/detail/sort.h
+++ b/cpp/include/raft/sparse/op/detail/sort.h
@@ -68,8 +68,8 @@ struct TupleComp {
  * @param vals vals array from coo matrix
  * @param stream: cuda stream to use
  */
-template <typename T, typename IdxT = int>
-void coo_sort(IdxT m, IdxT n, uint64_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
+template <typename T, typename IdxT = int, typename nnz_t>
+void coo_sort(IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
 {
   auto coo_indices = thrust::make_zip_iterator(thrust::make_tuple(rows, cols));
 
@@ -86,7 +86,8 @@ void coo_sort(IdxT m, IdxT n, uint64_t nnz, IdxT* rows, IdxT* cols, T* vals, cud
 template <typename T, typename IdxT = int>
 void coo_sort(COO<T, IdxT>* const in, cudaStream_t stream)
 {
-  coo_sort<T, IdxT>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
+  coo_sort<T, IdxT>(
+    in->n_rows, in->n_cols, in->safe_nnz, in->rows(), in->cols(), in->vals(), stream);
 }
 
 /**
@@ -99,9 +100,9 @@ void coo_sort(COO<T, IdxT>* const in, cudaStream_t stream)
  * @param[in] nnz number of edges in edge list
  * @param[in] stream cuda stream for which to order cuda operations
  */
-template <typename value_idx, typename value_t>
+template <typename value_idx, typename value_t, typename nnz_t>
 void coo_sort_by_weight(
-  value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream)
+  value_idx* rows, value_idx* cols, value_t* data, nnz_t nnz, cudaStream_t stream)
 {
   thrust::device_ptr<value_t> t_data = thrust::device_pointer_cast(data);
 
diff --git a/cpp/include/raft/sparse/op/sort.cuh b/cpp/include/raft/sparse/op/sort.cuh
index 35b5fd9f31..2a6cb52c83 100644
--- a/cpp/include/raft/sparse/op/sort.cuh
+++ b/cpp/include/raft/sparse/op/sort.cuh
@@ -37,8 +37,8 @@ namespace op {
  * @param vals vals array from coo matrix
  * @param stream: cuda stream to use
  */
-template <typename T, typename IdxT = int>
-void coo_sort(IdxT m, IdxT n, uint64_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
+template <typename T, typename IdxT = int, typename nnz_t>
+void coo_sort(IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaStream_t stream)
 {
   detail::coo_sort(m, n, nnz, rows, cols, vals, stream);
 }
@@ -52,7 +52,8 @@ void coo_sort(IdxT m, IdxT n, uint64_t nnz, IdxT* rows, IdxT* cols, T* vals, cud
 template <typename T, typename IdxT = int>
 void coo_sort(COO<T, IdxT>* const in, cudaStream_t stream)
 {
-  coo_sort<T, IdxT>(in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
+  coo_sort<T, IdxT>(
+    in->n_rows, in->n_cols, in->safe_nnz, in->rows(), in->cols(), in->vals(), stream);
 }
 
 /**
@@ -65,9 +66,9 @@ void coo_sort(COO<T, IdxT>* const in, cudaStream_t stream)
  * @param[in] nnz number of edges in edge list
  * @param[in] stream cuda stream for which to order cuda operations
  */
-template <typename value_idx, typename value_t>
+template <typename value_idx, typename value_t, typename nnz_t>
 void coo_sort_by_weight(
-  value_idx* rows, value_idx* cols, value_t* data, value_idx nnz, cudaStream_t stream)
+  value_idx* rows, value_idx* cols, value_t* data, nnz_t nnz, cudaStream_t stream)
 {
   detail::coo_sort_by_weight(rows, cols, data, nnz, stream);
 }

From ccb3b93c8629ac151f93a12c8034fec42884c6d0 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 29 Jan 2025 15:56:22 -0500
Subject: [PATCH 10/25] Adding explicit types ot COO object

---
 cpp/include/raft/sparse/coo.hpp               |  4 +-
 cpp/include/raft/sparse/detail/coo.cuh        | 69 +++++++------------
 .../raft/sparse/linalg/detail/degree.cuh      |  4 +-
 .../raft/sparse/linalg/detail/symmetrize.cuh  |  4 +-
 cpp/include/raft/sparse/op/detail/filter.cuh  |  8 +--
 5 files changed, 36 insertions(+), 53 deletions(-)

diff --git a/cpp/include/raft/sparse/coo.hpp b/cpp/include/raft/sparse/coo.hpp
index a176fefc3e..1c61117dab 100644
--- a/cpp/include/raft/sparse/coo.hpp
+++ b/cpp/include/raft/sparse/coo.hpp
@@ -39,8 +39,8 @@ namespace sparse {
  * @tparam value_idx: the type of index array
  *
  */
-template <typename value_t, typename value_idx = int>
-using COO = detail::COO<value_t, value_idx>;
+template <typename value_t, typename value_idx = int, typename nnz_t = uint64_t>
+using COO = detail::COO<value_t, value_idx, nnz_t>;
 
 };  // namespace sparse
 };  // namespace raft
diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh
index 38cd33cb3e..deedffa636 100644
--- a/cpp/include/raft/sparse/detail/coo.cuh
+++ b/cpp/include/raft/sparse/detail/coo.cuh
@@ -44,7 +44,7 @@ namespace detail {
  * @tparam Index_Type: the type of index array
  *
  */
-template <typename T, typename Index_Type = int>
+template <typename T, typename Index_Type = int, typename nnz_type = uint64_t>
 class COO {
  protected:
   rmm::device_uvector<Index_Type> rows_arr;
@@ -52,8 +52,11 @@ class COO {
   rmm::device_uvector<T> vals_arr;
 
  public:
-  Index_Type nnz;
-  uint64_t safe_nnz;
+  using value_t = T;
+  using index_t = Index_Type;
+  using nnz_t   = nnz_type;
+
+  nnz_type nnz;
   Index_Type n_rows;
   Index_Type n_cols;
 
@@ -61,13 +64,7 @@ class COO {
    * @param stream: CUDA stream to use
    */
   COO(cudaStream_t stream)
-    : rows_arr(0, stream),
-      cols_arr(0, stream),
-      vals_arr(0, stream),
-      nnz(0),
-      n_rows(0),
-      n_cols(0),
-      safe_nnz(0)
+    : rows_arr(0, stream), cols_arr(0, stream), vals_arr(0, stream), nnz(0), n_rows(0), n_cols(0)
   {
   }
 
@@ -79,20 +76,13 @@ class COO {
    * @param n_rows: number of rows in the dense matrix
    * @param n_cols: number of cols in the dense matrix
    */
-  template <typename SafeNNZ_Type>
   COO(rmm::device_uvector<Index_Type>& rows,
       rmm::device_uvector<Index_Type>& cols,
       rmm::device_uvector<T>& vals,
-      SafeNNZ_Type nnz,
+      nnz_type nnz,
       Index_Type n_rows = 0,
       Index_Type n_cols = 0)
-    : rows_arr(rows),
-      cols_arr(cols),
-      vals_arr(vals),
-      nnz((Index_Type)nnz),
-      n_rows(n_rows),
-      n_cols(n_cols),
-      safe_nnz((uint64_t)nnz)
+    : rows_arr(rows), cols_arr(cols), vals_arr(vals), nnz(nnz), n_rows(n_rows), n_cols(n_cols)
   {
   }
 
@@ -103,9 +93,8 @@ class COO {
    * @param n_cols: number of cols in the dense matrix
    * @param init: initialize arrays with zeros
    */
-  template <typename SafeNNZ_Type>
   COO(cudaStream_t stream,
-      SafeNNZ_Type nnz,
+      nnz_type nnz,
       Index_Type n_rows = 0,
       Index_Type n_cols = 0,
       bool init         = true)
@@ -114,8 +103,7 @@ class COO {
       vals_arr(nnz, stream),
       nnz(nnz),
       n_rows(n_rows),
-      n_cols(n_cols),
-      safe_nnz(nnz)
+      n_cols(n_cols)
   {
     if (init) init_arrays(stream);
   }
@@ -123,10 +111,10 @@ class COO {
   void init_arrays(cudaStream_t stream)
   {
     RAFT_CUDA_TRY(
-      cudaMemsetAsync(this->rows_arr.data(), 0, this->safe_nnz * sizeof(Index_Type), stream));
+      cudaMemsetAsync(this->rows_arr.data(), 0, this - nnz * sizeof(Index_Type), stream));
     RAFT_CUDA_TRY(
-      cudaMemsetAsync(this->cols_arr.data(), 0, this->safe_nnz * sizeof(Index_Type), stream));
-    RAFT_CUDA_TRY(cudaMemsetAsync(this->vals_arr.data(), 0, this->safe_nnz * sizeof(T), stream));
+      cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
+    RAFT_CUDA_TRY(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
   }
 
   ~COO() {}
@@ -137,7 +125,7 @@ class COO {
    */
   bool validate_size() const
   {
-    if (this->safe_nnz <= 0 || n_rows <= 0 || n_cols <= 0) return false;
+    if (this->nnz <= 0 || n_rows <= 0 || n_cols <= 0) return false;
     return true;
   }
 
@@ -172,16 +160,16 @@ class COO {
   /**
    * @brief Send human-readable state information to output stream
    */
-  friend std::ostream& operator<<(std::ostream& out, const COO<T, Index_Type>& c)
+  friend std::ostream& operator<<(std::ostream& out, const COO<T, Index_Type, nnz_type>& c)
   {
     if (c.validate_size() && c.validate_mem()) {
       cudaStream_t stream;
       RAFT_CUDA_TRY(cudaStreamCreateWithFlags(&stream, cudaStreamNonBlocking));
 
-      out << raft::arr2Str(c.rows_arr.data(), c.safe_nnz, "rows", stream) << std::endl;
-      out << raft::arr2Str(c.cols_arr.data(), c.safe_nnz, "cols", stream) << std::endl;
-      out << raft::arr2Str(c.vals_arr.data(), c.safe_nnz, "vals", stream) << std::endl;
-      out << "nnz=" << c.safe_nnz << std::endl;
+      out << raft::arr2Str(c.rows_arr.data(), c.nnz, "rows", stream) << std::endl;
+      out << raft::arr2Str(c.cols_arr.data(), c.nnz, "cols", stream) << std::endl;
+      out << raft::arr2Str(c.vals_arr.data(), c.nnz, "vals", stream) << std::endl;
+      out << "nnz=" << c.nnz << std::endl;
       out << "n_rows=" << c.n_rows << std::endl;
       out << "n_cols=" << c.n_cols << std::endl;
 
@@ -220,8 +208,7 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  template <typename SafeNNZ_Type>
-  void allocate(SafeNNZ_Type nnz, bool init, cudaStream_t stream)
+  void allocate(nnz_type nnz, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, 0, init, stream);
   }
@@ -233,8 +220,7 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: CUDA stream to use
    */
-  template <typename SafeNNZ_Type>
-  void allocate(SafeNNZ_Type nnz, Index_Type size, bool init, cudaStream_t stream)
+  void allocate(nnz_type nnz, Index_Type size, bool init, cudaStream_t stream)
   {
     this->allocate(nnz, size, size, init, stream);
   }
@@ -247,14 +233,11 @@ class COO {
    * @param init: should values be initialized to 0?
    * @param stream: stream to use for init
    */
-  template <typename SafeNNZ_Type>
-  void allocate(
-    SafeNNZ_Type nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
+  void allocate(nnz_type nnz, Index_Type n_rows, Index_Type n_cols, bool init, cudaStream_t stream)
   {
-    this->n_rows   = n_rows;
-    this->n_cols   = n_cols;
-    this->nnz      = (Index_Type)nnz;
-    this->safe_nnz = nnz;
+    this->n_rows = n_rows;
+    this->n_cols = n_cols;
+    this->nnz    = nnz;
 
     this->rows_arr.resize(nnz, stream);
     this->cols_arr.resize(nnz, stream);
diff --git a/cpp/include/raft/sparse/linalg/detail/degree.cuh b/cpp/include/raft/sparse/linalg/detail/degree.cuh
index 0163e15878..6a83ddae42 100644
--- a/cpp/include/raft/sparse/linalg/detail/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/degree.cuh
@@ -54,10 +54,10 @@ RAFT_KERNEL coo_degree_kernel(const T* rows, nnz_t nnz, outT* results)
  * @param results: output result array
  * @param stream: cuda stream to use
  */
-template <uint64_t TPB_X = 64, typename T = int, typename outT, typename nnz_t>
+template <int TPB_X = 64, typename T = int, typename outT, typename nnz_t>
 void coo_degree(const T* rows, nnz_t nnz, outT* results, cudaStream_t stream)
 {
-  dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
+  dim3 grid_rc(raft::ceildiv((nnz_t)nnz, (nnz_t)TPB_X), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
 
   coo_degree_kernel<TPB_X><<<grid_rc, blk_rc, 0, stream>>>(rows, nnz, results);
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index 513b71d0b2..af9102fa2f 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -68,7 +68,7 @@ RAFT_KERNEL coo_symmetrize_kernel(nnz_t* row_ind,
     nnz_t row_nnz       = 0;
     nnz_t out_start_idx = start_idx * 2;
 
-    for (uint64_t idx = 0; idx < stop_idx - start_idx; idx++) {
+    for (nnz_t idx = 0; idx < stop_idx - start_idx; idx++) {
       int cur_row = rows[start_idx + idx];
       int cur_col = cols[start_idx + idx];
       T cur_val   = vals[start_idx + idx];
@@ -156,7 +156,7 @@ void coo_symmetrize(COO<T>* in,
                                                             out->cols(),
                                                             out->vals(),
                                                             in->n_rows,
-                                                            in->nnz,
+                                                            in->safe_nnz,
                                                             reduction_op);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index 881db377f9..db82c8ea7f 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -144,11 +144,11 @@ void coo_remove_scalar(const int* rows,
  * @param scalar: scalar to remove from arrays
  * @param stream: cuda stream to use
  */
-template <int TPB_X, typename T>
+template <int TPB_X, typename T, typename nnz_t>
 void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
 {
-  rmm::device_uvector<nnz_t> row_count_nz(in->n_rows, stream);
-  rmm::device_uvector<nnz_t> row_count(in->n_rows, stream);
+  rmm::device_uvector<uint64_t> row_count_nz(in->n_rows, stream);
+  rmm::device_uvector<uint64_t> row_count(in->n_rows, stream);
 
   RAFT_CUDA_TRY(
     cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
@@ -166,7 +166,7 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
                             stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  thrust::device_ptr<nnz_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
+  thrust::device_ptr<uint64_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
   uint64_t out_nnz =
     thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
 

From b831e4057c2f92c0890f947c273cb04b805dba91 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 29 Jan 2025 16:28:42 -0500
Subject: [PATCH 11/25] Updating COO object and sparse primitives to include an
 NNZ type

---
 cpp/include/raft/sparse/convert/coo.cuh       |  6 +-
 cpp/include/raft/sparse/convert/csr.cuh       |  2 +-
 .../raft/sparse/convert/detail/coo.cuh        |  4 +-
 cpp/include/raft/sparse/detail/coo.cuh        |  2 +-
 .../raft/sparse/linalg/detail/symmetrize.cuh  |  2 +-
 cpp/include/raft/sparse/op/detail/filter.cuh  | 71 +++++++++----------
 cpp/include/raft/sparse/op/detail/sort.h      |  8 +--
 cpp/include/raft/sparse/op/filter.cuh         | 33 +++++----
 cpp/include/raft/sparse/op/sort.cuh           |  8 +--
 cpp/tests/sparse/solver/lanczos.cu            |  4 +-
 cpp/tests/sparse/symmetrize.cu                | 20 +++---
 11 files changed, 80 insertions(+), 80 deletions(-)

diff --git a/cpp/include/raft/sparse/convert/coo.cuh b/cpp/include/raft/sparse/convert/coo.cuh
index ba3efc7ff0..69f7864602 100644
--- a/cpp/include/raft/sparse/convert/coo.cuh
+++ b/cpp/include/raft/sparse/convert/coo.cuh
@@ -32,11 +32,11 @@ namespace convert {
  * @param nnz: size of output COO row array
  * @param stream: cuda stream to use
  */
-template <typename value_idx = int>
+template <typename value_idx = int, typename nnz_t>
 void csr_to_coo(
-  const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
+  const value_idx* row_ind, value_idx m, value_idx* coo_rows, nnz_t nnz, cudaStream_t stream)
 {
-  detail::csr_to_coo<value_idx, 32>(row_ind, m, coo_rows, nnz, stream);
+  detail::csr_to_coo<value_idx, nnz_t, 32>(row_ind, m, coo_rows, nnz, stream);
 }
 
 };  // end NAMESPACE convert
diff --git a/cpp/include/raft/sparse/convert/csr.cuh b/cpp/include/raft/sparse/convert/csr.cuh
index 53a025ec71..62137d04b7 100644
--- a/cpp/include/raft/sparse/convert/csr.cuh
+++ b/cpp/include/raft/sparse/convert/csr.cuh
@@ -70,7 +70,7 @@ void sorted_coo_to_csr(const T* rows, nnz_type nnz, outT* row_ind, int m, cudaSt
 template <typename T, typename outT>
 void sorted_coo_to_csr(COO<T>* coo, outT* row_ind, cudaStream_t stream)
 {
-  detail::sorted_coo_to_csr(coo->rows(), coo->safe_nnz, row_ind, coo->n_rows, stream);
+  detail::sorted_coo_to_csr(coo->rows(), coo->nnz, row_ind, coo->n_rows, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/convert/detail/coo.cuh b/cpp/include/raft/sparse/convert/detail/coo.cuh
index 469dac3c86..009762b4df 100644
--- a/cpp/include/raft/sparse/convert/detail/coo.cuh
+++ b/cpp/include/raft/sparse/convert/detail/coo.cuh
@@ -60,9 +60,9 @@ RAFT_KERNEL csr_to_coo_kernel(const value_idx* row_ind,
  * @param nnz: size of output COO row array
  * @param stream: cuda stream to use
  */
-template <typename value_idx = int, int TPB_X = 32>
+template <typename value_idx = int, typename nnz_t, int TPB_X = 32>
 void csr_to_coo(
-  const value_idx* row_ind, value_idx m, value_idx* coo_rows, value_idx nnz, cudaStream_t stream)
+  const value_idx* row_ind, value_idx m, value_idx* coo_rows, nnz_t nnz, cudaStream_t stream)
 {
   // @TODO: Use cusparse for this.
   dim3 grid(raft::ceildiv(m, (value_idx)TPB_X), 1, 1);
diff --git a/cpp/include/raft/sparse/detail/coo.cuh b/cpp/include/raft/sparse/detail/coo.cuh
index deedffa636..5e1514d228 100644
--- a/cpp/include/raft/sparse/detail/coo.cuh
+++ b/cpp/include/raft/sparse/detail/coo.cuh
@@ -111,7 +111,7 @@ class COO {
   void init_arrays(cudaStream_t stream)
   {
     RAFT_CUDA_TRY(
-      cudaMemsetAsync(this->rows_arr.data(), 0, this - nnz * sizeof(Index_Type), stream));
+      cudaMemsetAsync(this->rows_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
     RAFT_CUDA_TRY(
       cudaMemsetAsync(this->cols_arr.data(), 0, this->nnz * sizeof(Index_Type), stream));
     RAFT_CUDA_TRY(cudaMemsetAsync(this->vals_arr.data(), 0, this->nnz * sizeof(T), stream));
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index af9102fa2f..e60db71bdb 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -156,7 +156,7 @@ void coo_symmetrize(COO<T>* in,
                                                             out->cols(),
                                                             out->vals(),
                                                             in->n_rows,
-                                                            in->safe_nnz,
+                                                            in->nnz,
                                                             reduction_op);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index db82c8ea7f..e0d072e8bb 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -90,18 +90,18 @@ RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
  * @param d_alloc device allocator for temporary buffers
  * @param stream: cuda stream to use
  */
-template <int TPB_X, typename T, typename nnz_t>
-void coo_remove_scalar(const int* rows,
-                       const int* cols,
+template <int TPB_X, typename T, typename idx_t, typename nnz_t>
+void coo_remove_scalar(const idx_t* rows,
+                       const idx_t* cols,
                        const T* vals,
                        nnz_t nnz,
-                       int* crows,
-                       int* ccols,
+                       idx_t* crows,
+                       idx_t* ccols,
                        T* cvals,
                        nnz_t* cnnz,
                        nnz_t* cur_cnnz,
                        T scalar,
-                       int n,
+                       idx_t n,
                        cudaStream_t stream)
 {
   rmm::device_uvector<uint64_t> ex_scan(n, stream);
@@ -144,46 +144,43 @@ void coo_remove_scalar(const int* rows,
  * @param scalar: scalar to remove from arrays
  * @param stream: cuda stream to use
  */
-template <int TPB_X, typename T, typename nnz_t>
-void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
+template <int TPB_X, typename T, typename idx_t, typename nnz_t>
+void coo_remove_scalar(COO<T, idx_t, nnz_t>* in,
+                       COO<T, idx_t, nnz_t>* out,
+                       T scalar,
+                       cudaStream_t stream)
 {
-  rmm::device_uvector<uint64_t> row_count_nz(in->n_rows, stream);
-  rmm::device_uvector<uint64_t> row_count(in->n_rows, stream);
+  rmm::device_uvector<nnz_t> row_count_nz(in->n_rows, stream);
+  rmm::device_uvector<nnz_t> row_count(in->n_rows, stream);
 
-  RAFT_CUDA_TRY(
-    cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
-  RAFT_CUDA_TRY(
-    cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(uint64_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (idx_t)in->n_rows * sizeof(idx_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (idx_t)in->n_rows * sizeof(idx_t), stream));
 
-  linalg::coo_degree(in->rows(), in->safe_nnz, row_count.data(), stream);
+  linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  linalg::coo_degree_scalar(in->rows(),
-                            in->vals(),
-                            in->safe_nnz,
-                            scalar,
-                            (unsigned long long int*)row_count_nz.data(),
-                            stream);
+  linalg::coo_degree_scalar(
+    in->rows(), in->vals(), in->nnz, scalar, (unsigned long long int*)row_count_nz.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  thrust::device_ptr<uint64_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
+  thrust::device_ptr<nnz_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
   uint64_t out_nnz =
     thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
-  coo_remove_scalar<TPB_X, T>(in->rows(),
-                              in->cols(),
-                              in->vals(),
-                              in->nnz,
-                              out->rows(),
-                              out->cols(),
-                              out->vals(),
-                              row_count_nz.data(),
-                              row_count.data(),
-                              scalar,
-                              in->n_rows,
-                              stream);
+  coo_remove_scalar<TPB_X, T, idx_t, nnz_t>(in->rows(),
+                                            in->cols(),
+                                            in->vals(),
+                                            in->nnz,
+                                            out->rows(),
+                                            out->cols(),
+                                            out->vals(),
+                                            row_count_nz.data(),
+                                            row_count.data(),
+                                            scalar,
+                                            in->n_rows,
+                                            stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }
 
@@ -194,10 +191,10 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
  * @param out: output COO matrix
  * @param stream: cuda stream to use
  */
-template <int TPB_X, typename T>
-void coo_remove_zeros(COO<T>* in, COO<T>* out, cudaStream_t stream)
+template <int TPB_X, typename T, typename idx_t, typename nnz_t>
+void coo_remove_zeros(COO<T, idx_t, nnz_t>* in, COO<T, idx_t, nnz_t>* out, cudaStream_t stream)
 {
-  coo_remove_scalar<TPB_X, T>(in, out, T(0.0), stream);
+  coo_remove_scalar<TPB_X, T, idx_t, nnz_t>(in, out, T(0.0), stream);
 }
 
 };  // namespace detail
diff --git a/cpp/include/raft/sparse/op/detail/sort.h b/cpp/include/raft/sparse/op/detail/sort.h
index 6464ad9374..8bd848bedb 100644
--- a/cpp/include/raft/sparse/op/detail/sort.h
+++ b/cpp/include/raft/sparse/op/detail/sort.h
@@ -83,11 +83,11 @@ void coo_sort(IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaSt
  * @param in: COO to sort by row
  * @param stream: the cuda stream to use
  */
-template <typename T, typename IdxT = int>
-void coo_sort(COO<T, IdxT>* const in, cudaStream_t stream)
+template <typename T, typename IdxT = int, typename nnz_t = uint64_t>
+void coo_sort(COO<T, IdxT, nnz_t>* const in, cudaStream_t stream)
 {
-  coo_sort<T, IdxT>(
-    in->n_rows, in->n_cols, in->safe_nnz, in->rows(), in->cols(), in->vals(), stream);
+  coo_sort<T, IdxT, nnz_t>(
+    in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index 4b329325ca..5541a375d2 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -42,21 +42,21 @@ namespace op {
  * @param n: number of rows in dense matrix
  * @param stream: cuda stream to use
  */
-template <typename T>
-void coo_remove_scalar(const int* rows,
-                       const int* cols,
+template <typename T, typename idx_t, typename nnz_t>
+void coo_remove_scalar(const idx_t* rows,
+                       const idx_t* cols,
                        const T* vals,
                        int nnz,
-                       int* crows,
-                       int* ccols,
+                       idx_t* crows,
+                       idx_t* ccols,
                        T* cvals,
-                       int* cnnz,
-                       int* cur_cnnz,
+                       nnz_t* cnnz,
+                       nnz_t* cur_cnnz,
                        T scalar,
-                       int n,
+                       idx_t n,
                        cudaStream_t stream)
 {
-  detail::coo_remove_scalar<128, T>(
+  detail::coo_remove_scalar<128, T, idx_t, nnz_t>(
     rows, cols, vals, nnz, crows, ccols, cvals, cnnz, cur_cnnz, scalar, n, stream);
 }
 
@@ -68,10 +68,13 @@ void coo_remove_scalar(const int* rows,
  * @param scalar: scalar to remove from arrays
  * @param stream: cuda stream to use
  */
-template <typename T>
-void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
+template <typename T, typename idx_t, typename nnz_t>
+void coo_remove_scalar(COO<T, idx_t, nnz_t>* in,
+                       COO<T, idx_t, nnz_t>* out,
+                       T scalar,
+                       cudaStream_t stream)
 {
-  detail::coo_remove_scalar<128, T>(in, out, scalar, stream);
+  detail::coo_remove_scalar<128, T, idx_t, nnz_t>(in, out, scalar, stream);
 }
 
 /**
@@ -81,10 +84,10 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
  * @param out: output COO matrix
  * @param stream: cuda stream to use
  */
-template <typename T>
-void coo_remove_zeros(COO<T>* in, COO<T>* out, cudaStream_t stream)
+template <typename T, typename idx_t, typename nnz_t>
+void coo_remove_zeros(COO<T, idx_t, nnz_t>* in, COO<T, idx_t, nnz_t>* out, cudaStream_t stream)
 {
-  coo_remove_scalar<T>(in, out, T(0.0), stream);
+  coo_remove_scalar<T, idx_t, nnz_t>(in, out, T(0.0), stream);
 }
 
 };  // namespace op
diff --git a/cpp/include/raft/sparse/op/sort.cuh b/cpp/include/raft/sparse/op/sort.cuh
index 2a6cb52c83..62231e561e 100644
--- a/cpp/include/raft/sparse/op/sort.cuh
+++ b/cpp/include/raft/sparse/op/sort.cuh
@@ -49,11 +49,11 @@ void coo_sort(IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaSt
  * @param in: COO to sort by row
  * @param stream: the cuda stream to use
  */
-template <typename T, typename IdxT = int>
-void coo_sort(COO<T, IdxT>* const in, cudaStream_t stream)
+template <typename T, typename IdxT = int, typename nnz_t>
+void coo_sort(COO<T, IdxT, nnz_t>* const in, cudaStream_t stream)
 {
-  coo_sort<T, IdxT>(
-    in->n_rows, in->n_cols, in->safe_nnz, in->rows(), in->cols(), in->vals(), stream);
+  coo_sort<T, IdxT, nnz_t>(
+    in->n_rows, in->n_cols, in->nnz, in->rows(), in->cols(), in->vals(), stream);
 }
 
 /**
diff --git a/cpp/tests/sparse/solver/lanczos.cu b/cpp/tests/sparse/solver/lanczos.cu
index 128ab73747..88bc208f0b 100644
--- a/cpp/tests/sparse/solver/lanczos.cu
+++ b/cpp/tests/sparse/solver/lanczos.cu
@@ -192,7 +192,7 @@ class rmat_lanczos_tests
       raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(
         handle, symmetric_coo.n_rows, n_components);
 
-    raft::spectral::matrix::sparse_matrix_t<IndexType, ValueType> const csr_m{
+    raft::spectral::matrix::sparse_matrix_t<IndexType, ValueType, uint64_t> const csr_m{
       handle,
       row_indices.data_handle(),
       symmetric_coo.cols(),
@@ -203,7 +203,7 @@ class rmat_lanczos_tests
       n_components, params.maxiter, params.restartiter, params.tol, rng.seed};
 
     auto csr_structure =
-      raft::make_device_compressed_structure_view<IndexType, IndexType, IndexType>(
+      raft::make_device_compressed_structure_view<IndexType, IndexType, uint64_t>(
         const_cast<IndexType*>(row_indices.data_handle()),
         const_cast<IndexType*>(symmetric_coo.cols()),
         symmetric_coo.n_rows,
diff --git a/cpp/tests/sparse/symmetrize.cu b/cpp/tests/sparse/symmetrize.cu
index 5f7463ac89..358bbfaa83 100644
--- a/cpp/tests/sparse/symmetrize.cu
+++ b/cpp/tests/sparse/symmetrize.cu
@@ -32,9 +32,9 @@
 namespace raft {
 namespace sparse {
 
-template <typename value_idx, typename value_t>
+template <typename value_idx, typename value_t, typename nnz_t>
 RAFT_KERNEL assert_symmetry(
-  value_idx* rows, value_idx* cols, value_t* vals, value_idx nnz, value_idx* sum)
+  value_idx* rows, value_idx* cols, value_t* vals, nnz_t nnz, value_idx* sum)
 {
   int tid = blockDim.x * blockIdx.x + threadIdx.x;
 
@@ -60,7 +60,7 @@ template <typename value_idx, typename value_t>
   return os;
 }
 
-template <typename value_idx, typename value_t>
+template <typename value_idx, typename value_t, typename nnz_t>
 class SparseSymmetrizeTest
   : public ::testing::TestWithParam<SparseSymmetrizeInputs<value_idx, value_t>> {
  public:
@@ -93,15 +93,15 @@ class SparseSymmetrizeTest
   {
     make_data();
 
-    value_idx m   = params.indptr_h.size() - 1;
-    value_idx n   = params.n_cols;
-    value_idx nnz = params.indices_h.size();
+    value_idx m = params.indptr_h.size() - 1;
+    value_idx n = params.n_cols;
+    nnz_t nnz   = params.indices_h.size();
 
     rmm::device_uvector<value_idx> coo_rows(nnz, stream);
 
     raft::sparse::convert::csr_to_coo(indptr.data(), m, coo_rows.data(), nnz, stream);
 
-    raft::sparse::COO<value_t, value_idx> out(stream);
+    raft::sparse::COO<value_t, value_idx, nnz_t> out(stream);
 
     raft::sparse::linalg::symmetrize(
       handle, coo_rows.data(), indices.data(), data.data(), m, n, coo_rows.size(), out);
@@ -109,8 +109,8 @@ class SparseSymmetrizeTest
     rmm::device_scalar<value_idx> sum(stream);
     sum.set_value_to_zero_async(stream);
 
-    assert_symmetry<<<raft::ceildiv(out.nnz, 256), 256, 0, stream>>>(
-      out.rows(), out.cols(), out.vals(), (value_idx)out.nnz, sum.data());
+    assert_symmetry<<<raft::ceildiv(out.nnz, (nnz_t)256), 256, 0, stream>>>(
+      out.rows(), out.cols(), out.vals(), (nnz_t)out.nnz, sum.data());
 
     sum_h = sum.value(stream);
     resource::sync_stream(handle, stream);
@@ -211,7 +211,7 @@ const std::vector<SparseSymmetrizeInputs<int, float>> symm_inputs_fint = {
 
 };
 
-typedef SparseSymmetrizeTest<int, float> SparseSymmetrizeTestF_int;
+typedef SparseSymmetrizeTest<int, float, uint64_t> SparseSymmetrizeTestF_int;
 TEST_P(SparseSymmetrizeTestF_int, Result) { ASSERT_TRUE(sum_h == 0); }
 
 INSTANTIATE_TEST_CASE_P(SparseSymmetrizeTest,

From aed76860e5e92e3de12277745839fcf82480ab41 Mon Sep 17 00:00:00 2001
From: "Corey J. Nolet" <cjnolet@gmail.com>
Date: Wed, 29 Jan 2025 16:54:59 -0500
Subject: [PATCH 12/25] More updates

---
 .../raft/sparse/linalg/detail/symmetrize.cuh  | 12 +++++-----
 cpp/include/raft/sparse/linalg/symmetrize.cuh | 10 ++++----
 cpp/include/raft/sparse/op/detail/reduce.cuh  | 24 +++++++++----------
 cpp/include/raft/sparse/op/reduce.cuh         | 10 ++++----
 cpp/tests/sparse/reduce.cu                    | 12 +++++-----
 cpp/tests/sparse/solver/lanczos.cu            | 14 +++++------
 6 files changed, 41 insertions(+), 41 deletions(-)

diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index e60db71bdb..3b76f0cc4f 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -324,15 +324,15 @@ void from_knn_symmetrize_matrix(const value_idx* __restrict__ knn_indices,
 /**
  * Symmetrizes a COO matrix
  */
-template <typename value_idx, typename value_t>
+template <typename value_idx, typename value_t, typename nnz_t>
 void symmetrize(raft::resources const& handle,
                 const value_idx* rows,
                 const value_idx* cols,
                 const value_t* vals,
-                size_t m,
-                size_t n,
-                size_t nnz,
-                raft::sparse::COO<value_t, value_idx>& out)
+                value_idx m,
+                value_idx n,
+                nnz_t nnz,
+                raft::sparse::COO<value_t, value_idx, nnz_t>& out)
 {
   auto stream = resource::get_cuda_stream(handle);
 
@@ -352,7 +352,7 @@ void symmetrize(raft::resources const& handle,
   // sort COO
   raft::sparse::op::coo_sort((value_idx)m,
                              (value_idx)n,
-                             (value_idx)nnz * 2,
+                             (nnz_t)nnz * 2,
                              symm_rows.data(),
                              symm_cols.data(),
                              symm_vals.data(),
diff --git a/cpp/include/raft/sparse/linalg/symmetrize.cuh b/cpp/include/raft/sparse/linalg/symmetrize.cuh
index 8ee53cd3ae..64bab11233 100644
--- a/cpp/include/raft/sparse/linalg/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/symmetrize.cuh
@@ -148,15 +148,15 @@ void from_knn_symmetrize_matrix(const value_idx* __restrict__ knn_indices,
 /**
  * Symmetrizes a COO matrix
  */
-template <typename value_idx, typename value_t>
+template <typename value_idx, typename value_t, typename nnz_t>
 void symmetrize(raft::resources const& handle,
                 const value_idx* rows,
                 const value_idx* cols,
                 const value_t* vals,
-                size_t m,
-                size_t n,
-                size_t nnz,
-                raft::sparse::COO<value_t, value_idx>& out)
+                value_idx m,
+                value_idx n,
+                nnz_t nnz,
+                raft::sparse::COO<value_t, value_idx, nnz_t>& out)
 {
   detail::symmetrize(handle, rows, cols, vals, m, n, nnz, out);
 }
diff --git a/cpp/include/raft/sparse/op/detail/reduce.cuh b/cpp/include/raft/sparse/op/detail/reduce.cuh
index 1e5dd87958..2359628b78 100644
--- a/cpp/include/raft/sparse/op/detail/reduce.cuh
+++ b/cpp/include/raft/sparse/op/detail/reduce.cuh
@@ -44,13 +44,13 @@ namespace sparse {
 namespace op {
 namespace detail {
 
-template <typename value_idx>
+template <typename value_idx, typename nnz_t>
 RAFT_KERNEL compute_duplicates_diffs_kernel(const value_idx* rows,
                                             const value_idx* cols,
                                             value_idx* diff,
-                                            size_t nnz)
+                                            nnz_t nnz)
 {
-  size_t tid = blockDim.x * blockIdx.x + threadIdx.x;
+  nnz_t tid = blockDim.x * blockIdx.x + threadIdx.x;
   if (tid >= nnz) return;
 
   value_idx d = 1;
@@ -98,13 +98,13 @@ RAFT_KERNEL max_duplicates_kernel(const value_idx* src_rows,
  * @param[in] nnz number of nonzeros in input arrays
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
-template <typename value_idx>
+template <typename value_idx, typename nnz_t>
 void compute_duplicates_mask(
-  value_idx* mask, const value_idx* rows, const value_idx* cols, size_t nnz, cudaStream_t stream)
+  value_idx* mask, const value_idx* rows, const value_idx* cols, nnz_t nnz, cudaStream_t stream)
 {
   RAFT_CUDA_TRY(cudaMemsetAsync(mask, 0, nnz * sizeof(value_idx), stream));
 
-  compute_duplicates_diffs_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0, stream>>>(
+  compute_duplicates_diffs_kernel<<<raft::ceildiv(nnz, (nnz_t)256), 256, 0, stream>>>(
     rows, cols, mask, nnz);
 }
 
@@ -124,15 +124,15 @@ void compute_duplicates_mask(
  * @param[in] n number of columns in COO input matrix
  * @param[in] stream cuda ops will be ordered wrt this stream
  */
-template <typename value_idx, typename value_t>
+template <typename value_idx, typename value_t, typename nnz_t>
 void max_duplicates(raft::resources const& handle,
-                    raft::sparse::COO<value_t, value_idx>& out,
+                    raft::sparse::COO<value_t, value_idx, nnz_t>& out,
                     const value_idx* rows,
                     const value_idx* cols,
                     const value_t* vals,
-                    size_t nnz,
-                    size_t m,
-                    size_t n)
+                    nnz_t nnz,
+                    value_idx m,
+                    value_idx n)
 {
   auto stream        = resource::get_cuda_stream(handle);
   auto thrust_policy = resource::get_thrust_policy(handle);
@@ -153,7 +153,7 @@ void max_duplicates(raft::resources const& handle,
   out.allocate(size, m, n, true, stream);
 
   // perform reduce
-  max_duplicates_kernel<<<raft::ceildiv(nnz, (size_t)256), 256, 0, stream>>>(
+  max_duplicates_kernel<<<raft::ceildiv(nnz, (nnz_t)256), 256, 0, stream>>>(
     rows, cols, vals, diff.data() + 1, out.rows(), out.cols(), out.vals(), nnz);
 }
 
diff --git a/cpp/include/raft/sparse/op/reduce.cuh b/cpp/include/raft/sparse/op/reduce.cuh
index b03192f111..102e864943 100644
--- a/cpp/include/raft/sparse/op/reduce.cuh
+++ b/cpp/include/raft/sparse/op/reduce.cuh
@@ -68,15 +68,15 @@ void compute_duplicates_mask(
  * @param[in] m number of rows in COO input matrix
  * @param[in] n number of columns in COO input matrix
  */
-template <typename value_idx, typename value_t>
+template <typename value_idx, typename value_t, typename nnz_t>
 void max_duplicates(raft::resources const& handle,
-                    raft::sparse::COO<value_t, value_idx>& out,
+                    raft::sparse::COO<value_t, value_idx, nnz_t>& out,
                     const value_idx* rows,
                     const value_idx* cols,
                     const value_t* vals,
-                    size_t nnz,
-                    size_t m,
-                    size_t n)
+                    nnz_t nnz,
+                    value_idx m,
+                    value_idx n)
 {
   detail::max_duplicates(handle, out, rows, cols, vals, nnz, m, n);
 }
diff --git a/cpp/tests/sparse/reduce.cu b/cpp/tests/sparse/reduce.cu
index f777f4781d..eef54f1ebe 100644
--- a/cpp/tests/sparse/reduce.cu
+++ b/cpp/tests/sparse/reduce.cu
@@ -41,8 +41,8 @@ struct SparseReduceInputs {
   std::vector<value_idx> out_cols;
   std::vector<value_t> out_vals;
 
-  size_t m;
-  size_t n;
+  value_idx m;
+  value_idx n;
 };
 
 template <typename value_t, typename value_idx>
@@ -73,15 +73,15 @@ class SparseReduceTest : public ::testing::TestWithParam<SparseReduceInputs<valu
     raft::update_device(out_cols.data(), params.out_cols.data(), params.out_cols.size(), stream);
     raft::update_device(out_vals.data(), params.out_vals.data(), params.out_vals.size(), stream);
 
-    raft::sparse::COO<value_t, value_idx> out(stream);
+    raft::sparse::COO<value_t, value_idx, value_idx> out(stream);
     raft::sparse::op::max_duplicates(handle,
                                      out,
                                      in_rows.data(),
                                      in_cols.data(),
                                      in_vals.data(),
-                                     params.in_rows.size(),
-                                     params.m,
-                                     params.n);
+                                     (value_idx)params.in_rows.size(),
+                                     (value_idx)params.m,
+                                     (value_idx)params.n);
     RAFT_CUDA_TRY(cudaStreamSynchronize(stream));
     ASSERT_TRUE(raft::devArrMatch<value_idx>(
       out_rows.data(), out.rows(), out.nnz, raft::Compare<value_idx>()));
diff --git a/cpp/tests/sparse/solver/lanczos.cu b/cpp/tests/sparse/solver/lanczos.cu
index 88bc208f0b..1a009bf8dc 100644
--- a/cpp/tests/sparse/solver/lanczos.cu
+++ b/cpp/tests/sparse/solver/lanczos.cu
@@ -147,7 +147,7 @@ class rmat_lanczos_tests
     raft::device_vector<ValueType, uint32_t, raft::row_major> out_data =
       raft::make_device_vector<ValueType, uint32_t, raft::row_major>(handle, n_edges);
     raft::matrix::fill<ValueType>(handle, out_data.view(), 1.0);
-    raft::sparse::COO<ValueType, IndexType> coo(stream);
+    raft::sparse::COO<ValueType, IndexType, IndexType> coo(stream);
 
     raft::sparse::op::coo_sort<ValueType, int>(n_nodes,
                                                n_nodes,
@@ -161,11 +161,11 @@ class rmat_lanczos_tests
                                                            out_src.data_handle(),
                                                            out_dst.data_handle(),
                                                            out_data.data_handle(),
-                                                           n_edges,
-                                                           n_nodes,
-                                                           n_nodes);
+                                                           (IndexType)n_edges,
+                                                           (IndexType)n_nodes,
+                                                           (IndexType)n_nodes);
 
-    raft::sparse::COO<ValueType, IndexType> symmetric_coo(stream);
+    raft::sparse::COO<ValueType, IndexType, IndexType> symmetric_coo(stream);
     raft::sparse::linalg::symmetrize(
       handle, coo.rows(), coo.cols(), coo.vals(), coo.n_rows, coo.n_cols, coo.nnz, symmetric_coo);
 
@@ -192,7 +192,7 @@ class rmat_lanczos_tests
       raft::make_device_matrix<ValueType, uint32_t, raft::col_major>(
         handle, symmetric_coo.n_rows, n_components);
 
-    raft::spectral::matrix::sparse_matrix_t<IndexType, ValueType, uint64_t> const csr_m{
+    raft::spectral::matrix::sparse_matrix_t<IndexType, ValueType> const csr_m{
       handle,
       row_indices.data_handle(),
       symmetric_coo.cols(),
@@ -203,7 +203,7 @@ class rmat_lanczos_tests
       n_components, params.maxiter, params.restartiter, params.tol, rng.seed};
 
     auto csr_structure =
-      raft::make_device_compressed_structure_view<IndexType, IndexType, uint64_t>(
+      raft::make_device_compressed_structure_view<IndexType, IndexType, IndexType>(
         const_cast<IndexType*>(row_indices.data_handle()),
         const_cast<IndexType*>(symmetric_coo.cols()),
         symmetric_coo.n_rows,

From a58ffcbbdca639ce574aedb28a282a734a5a9c43 Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Thu, 30 Jan 2025 12:19:48 +0000
Subject: [PATCH 13/25] completing change

---
 cpp/include/raft/sparse/linalg/degree.cuh          |  4 ++--
 .../sparse/neighbors/detail/cross_component_nn.cuh | 14 ++++++++++----
 .../raft/sparse/neighbors/detail/knn_graph.cuh     |  6 +++---
 .../raft/spectral/detail/matrix_wrappers.hpp       |  2 +-
 cpp/tests/linalg/eigen_solvers.cu                  |  2 +-
 cpp/tests/sparse/solver/lanczos.cu                 |  2 +-
 cpp/tests/sparse/spectral_matrix.cu                |  2 +-
 7 files changed, 19 insertions(+), 13 deletions(-)

diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 965c923993..567eb187c1 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -50,7 +50,7 @@ void coo_degree(const T* rows, nnz_type nnz, outT* results, cudaStream_t stream)
 template <typename T, typename outT>
 void coo_degree(COO<T>* in, outT* results, cudaStream_t stream)
 {
-  coo_degree(in->rows(), in->safe_nnz, results, stream);
+  coo_degree(in->rows(), in->nnz, results, stream);
 }
 
 /**
@@ -83,7 +83,7 @@ void coo_degree_scalar(
 template <typename T, typename outT>
 void coo_degree_scalar(COO<T>* in, T scalar, outT* results, cudaStream_t stream)
 {
-  coo_degree_scalar(in->rows(), in->vals(), in->safe_nnz, scalar, results, stream);
+  coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, results, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh b/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
index a47d5a6f34..3fea5cb330 100644
--- a/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/cross_component_nn.cuh
@@ -448,10 +448,10 @@ void min_components_by_color(raft::sparse::COO<value_t, value_idx>& coo,
  * is done
  * @param[in] metric distance metric
  */
-template <typename value_idx, typename value_t, typename red_op>
+template <typename value_idx, typename value_t, typename nnz_t, typename red_op>
 void cross_component_nn(
   raft::resources const& handle,
-  raft::sparse::COO<value_t, value_idx>& out,
+  raft::sparse::COO<value_t, value_idx, nnz_t>& out,
   const value_t* X,
   const value_idx* orig_colors,
   size_t n_rows,
@@ -534,8 +534,14 @@ void cross_component_nn(
   /**
    * Symmetrize resulting edge list
    */
-  raft::sparse::linalg::symmetrize(
-    handle, min_edges.rows(), min_edges.cols(), min_edges.vals(), n_rows, n_rows, size, out);
+  raft::sparse::linalg::symmetrize(handle,
+                                   min_edges.rows(),
+                                   min_edges.cols(),
+                                   min_edges.vals(),
+                                   (value_idx)n_rows,
+                                   (value_idx)n_rows,
+                                   (nnz_t)size,
+                                   out);
 }
 
 };  // end namespace raft::sparse::neighbors::detail
diff --git a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
index 4e46904c83..287722dfff 100644
--- a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
@@ -92,13 +92,13 @@ void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream)
  * @param[out] out output edge list
  * @param c
  */
-template <typename value_idx = int, typename value_t = float>
+template <typename value_idx = int, typename value_t = float, typename nnz_t>
 void knn_graph(raft::resources const& handle,
                const value_t* X,
                size_t m,
                size_t n,
                raft::distance::DistanceType metric,
-               raft::sparse::COO<value_t, value_idx>& out,
+               raft::sparse::COO<value_t, value_idx, nnz_t>& out,
                int c = 15)
 {
   size_t k = build_k(m, c);
@@ -142,7 +142,7 @@ void knn_graph(raft::resources const& handle,
   conv_indices(int64_indices.data(), indices.data(), nnz, stream);
 
   raft::sparse::linalg::symmetrize(
-    handle, rows.data(), indices.data(), data.data(), m, k, nnz, out);
+    handle, rows.data(), indices.data(), data.data(), (value_idx)m, (value_idx)k, (nnz_t)nnz, out);
 }
 
 };  // namespace raft::sparse::neighbors::detail
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index bab5b4515a..81abac7312 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -134,7 +134,7 @@ class vector_t {
   const thrust_exec_policy_t thrust_policy;
 };
 
-template <typename index_type, typename value_type, typename nnz_type = index_type>
+template <typename index_type, typename value_type, typename nnz_type = uint64_t>
 struct sparse_matrix_t {
   sparse_matrix_t(resources const& raft_handle,
                   index_type const* row_offsets,
diff --git a/cpp/tests/linalg/eigen_solvers.cu b/cpp/tests/linalg/eigen_solvers.cu
index cf75ff89bf..a7eec5561c 100644
--- a/cpp/tests/linalg/eigen_solvers.cu
+++ b/cpp/tests/linalg/eigen_solvers.cu
@@ -46,7 +46,7 @@ TEST(Raft, EigenSolvers)
   index_type nnz   = 0;
   index_type nrows = 0;
 
-  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
+  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, (uint64_t)nnz};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
 
   index_type neigvs{10};
diff --git a/cpp/tests/sparse/solver/lanczos.cu b/cpp/tests/sparse/solver/lanczos.cu
index 1a009bf8dc..3652b811e9 100644
--- a/cpp/tests/sparse/solver/lanczos.cu
+++ b/cpp/tests/sparse/solver/lanczos.cu
@@ -198,7 +198,7 @@ class rmat_lanczos_tests
       symmetric_coo.cols(),
       symmetric_coo.vals(),
       symmetric_coo.n_rows,
-      symmetric_coo.nnz};
+      (uint64_t)symmetric_coo.nnz};
     raft::sparse::solver::lanczos_solver_config<ValueType> config{
       n_components, params.maxiter, params.restartiter, params.tol, rng.seed};
 
diff --git a/cpp/tests/sparse/spectral_matrix.cu b/cpp/tests/sparse/spectral_matrix.cu
index 52f7eff10e..c177ec236e 100644
--- a/cpp/tests/sparse/spectral_matrix.cu
+++ b/cpp/tests/sparse/spectral_matrix.cu
@@ -55,7 +55,7 @@ TEST(Raft, SpectralMatrices)
   value_type* vs{nullptr};
   index_type nnz   = 0;
   index_type nrows = 0;
-  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, nnz};
+  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, (uint64_t)nnz};
   sparse_matrix_t<index_type, value_type> sm2{h, csr_v};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
   ASSERT_EQ(nullptr, sm2.row_offsets_);

From 1d10f7228e014b4be87f61fd12a971f0c192608c Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Thu, 30 Jan 2025 14:49:43 +0000
Subject: [PATCH 14/25] fixing issues

---
 cpp/include/raft/sparse/linalg/degree.cuh          | 2 +-
 cpp/include/raft/sparse/linalg/detail/spectral.cuh | 4 ++--
 cpp/include/raft/sparse/op/detail/filter.cuh       | 6 +++---
 cpp/include/raft/sparse/op/filter.cuh              | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 567eb187c1..9ce7321de0 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -36,7 +36,7 @@ namespace linalg {
 template <typename T = int, typename nnz_type, typename outT>
 void coo_degree(const T* rows, nnz_type nnz, outT* results, cudaStream_t stream)
 {
-  detail::coo_degree<64, T>(rows, (nnz_type)nnz, results, stream);
+  detail::coo_degree<64, T>(rows, nnz, results, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index 103030cbad..87dfa045fe 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -49,7 +49,7 @@ void fit_embedding(raft::resources const& handle,
                       rows,
                       cols,
                       vals,
-                      static_cast<int>(nnz),
+                      static_cast<uint64_t>(nnz),
                       static_cast<int>(n),
                       src_offsets.data(),
                       dst_cols.data(),
@@ -72,7 +72,7 @@ void fit_embedding(raft::resources const& handle,
   value_type* vs = dst_vals.data();
 
   raft::spectral::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
-    handle, ro, ci, vs, static_cast<int>(n), nnz};
+    handle, ro, ci, vs, static_cast<index_type>(n), static_cast<uint64_t>(nnz)};
 
   index_type neigvs       = n_components + 1;
   index_type maxiter      = 4000;  // default reset value (when set to 0);
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index e0d072e8bb..c5c095462d 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -153,8 +153,8 @@ void coo_remove_scalar(COO<T, idx_t, nnz_t>* in,
   rmm::device_uvector<nnz_t> row_count_nz(in->n_rows, stream);
   rmm::device_uvector<nnz_t> row_count(in->n_rows, stream);
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (idx_t)in->n_rows * sizeof(idx_t), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (idx_t)in->n_rows * sizeof(idx_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (nnz_t)in->n_rows * sizeof(nnz_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (nnz_t)in->n_rows * sizeof(nnz_t), stream));
 
   linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
@@ -164,7 +164,7 @@ void coo_remove_scalar(COO<T, idx_t, nnz_t>* in,
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   thrust::device_ptr<nnz_t> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
-  uint64_t out_nnz =
+  nnz_t out_nnz =
     thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows);
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
diff --git a/cpp/include/raft/sparse/op/filter.cuh b/cpp/include/raft/sparse/op/filter.cuh
index 5541a375d2..c257585c0e 100644
--- a/cpp/include/raft/sparse/op/filter.cuh
+++ b/cpp/include/raft/sparse/op/filter.cuh
@@ -46,7 +46,7 @@ template <typename T, typename idx_t, typename nnz_t>
 void coo_remove_scalar(const idx_t* rows,
                        const idx_t* cols,
                        const T* vals,
-                       int nnz,
+                       nnz_t nnz,
                        idx_t* crows,
                        idx_t* ccols,
                        T* cvals,

From 2d6f2dcab9bee7c66c1b9610a3b0fa0d72bc077a Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 30 Jan 2025 20:59:14 +0000
Subject: [PATCH 15/25] some updates

---
 cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh | 10 +++++-----
 cpp/include/raft/sparse/neighbors/knn_graph.cuh        |  8 ++++----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
index 4e46904c83..88391a8935 100644
--- a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
@@ -92,20 +92,20 @@ void conv_indices(in_t* inds, out_t* out, size_t size, cudaStream_t stream)
  * @param[out] out output edge list
  * @param c
  */
-template <typename value_idx = int, typename value_t = float>
+template <typename value_idx = int, typename value_t = float, typename nnz_t = size_t>
 void knn_graph(raft::resources const& handle,
                const value_t* X,
-               size_t m,
-               size_t n,
+               value_idx m,
+               value_idx n,
                raft::distance::DistanceType metric,
-               raft::sparse::COO<value_t, value_idx>& out,
+               raft::sparse::COO<value_t, value_idx, nnz_t>& out,
                int c = 15)
 {
   size_t k = build_k(m, c);
 
   auto stream = resource::get_cuda_stream(handle);
 
-  size_t nnz = m * k;
+  nnz_t nnz = m * k;
 
   rmm::device_uvector<value_idx> rows(nnz, stream);
   rmm::device_uvector<value_idx> indices(nnz, stream);
diff --git a/cpp/include/raft/sparse/neighbors/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/knn_graph.cuh
index 8257afc16f..6f318e7991 100644
--- a/cpp/include/raft/sparse/neighbors/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/knn_graph.cuh
@@ -40,13 +40,13 @@ namespace raft::sparse::neighbors {
  * @param[out] out output edge list
  * @param c
  */
-template <typename value_idx = int, typename value_t = float>
+template <typename value_idx = int, typename value_t = float, typename nnz_t = size_t>
 void knn_graph(raft::resources const& handle,
                const value_t* X,
-               std::size_t m,
-               std::size_t n,
+               value_idx m,
+               value_idx n,
                raft::distance::DistanceType metric,
-               raft::sparse::COO<value_t, value_idx>& out,
+               raft::sparse::COO<value_t, value_idx, nnz_t>& out,
                int c = 15)
 {
   detail::knn_graph(handle, X, m, n, metric, out, c);

From cbae3155c0ee9d10a26f4eb0724419602e0b198d Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 30 Jan 2025 22:59:53 +0000
Subject: [PATCH 16/25] working through updates for cuvs

---
 cpp/include/raft/linalg/detail/lanczos.cuh    | 16 ++++----
 .../sparse/neighbors/detail/knn_graph.cuh     |  2 +-
 .../raft/spectral/detail/matrix_wrappers.hpp  | 40 +++++++++----------
 3 files changed, 29 insertions(+), 29 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh
index d4d3325f77..33b9faa27d 100644
--- a/cpp/include/raft/linalg/detail/lanczos.cuh
+++ b/cpp/include/raft/linalg/detail/lanczos.cuh
@@ -745,10 +745,10 @@ static int lanczosRestart(raft::resources const& handle,
  *  @param seed random seed.
  *  @return error flag.
  */
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeSmallestEigenvectors(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const* A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
@@ -986,10 +986,10 @@ int computeSmallestEigenvectors(
   return 0;
 }
 
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeSmallestEigenvectors(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
@@ -1090,10 +1090,10 @@ int computeSmallestEigenvectors(
  *  @param seed random seed.
  *  @return error flag.
  */
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeLargestEigenvectors(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const* A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
@@ -1334,10 +1334,10 @@ int computeLargestEigenvectors(
   return 0;
 }
 
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeLargestEigenvectors(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
diff --git a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
index 435875f813..88391a8935 100644
--- a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
@@ -142,7 +142,7 @@ void knn_graph(raft::resources const& handle,
   conv_indices(int64_indices.data(), indices.data(), nnz, stream);
 
   raft::sparse::linalg::symmetrize(
-    handle, rows.data(), indices.data(), data.data(), (value_idx)m, (value_idx)k, (nnz_t)nnz, out);
+    handle, rows.data(), indices.data(), data.data(), m, k, nnz, out);
 }
 
 };  // namespace raft::sparse::neighbors::detail
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index 81abac7312..f7aa507059 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -314,15 +314,15 @@ struct sparse_matrix_t {
   nnz_type const nnz_;
 };
 
-template <typename index_type, typename value_type>
-struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
+template <typename index_type, typename value_type, typename nnz_type = uint64_t>
+struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type, nnz_type> {
   laplacian_matrix_t(resources const& raft_handle,
                      index_type const* row_offsets,
                      index_type const* col_indices,
                      value_type const* values,
                      index_type const nrows,
-                     index_type const nnz)
-    : sparse_matrix_t<index_type, value_type>(
+                     nnz_type const nnz)
+    : sparse_matrix_t<index_type, value_type, nnz_type>(
         raft_handle, row_offsets, col_indices, values, nrows, nnz),
       diagonal_(raft_handle, nrows)
   {
@@ -332,8 +332,8 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
   }
 
   laplacian_matrix_t(resources const& raft_handle,
-                     sparse_matrix_t<index_type, value_type> const& csr_m)
-    : sparse_matrix_t<index_type, value_type>(raft_handle,
+                     sparse_matrix_t<index_type, value_type, nnz_type> const& csr_m)
+    : sparse_matrix_t<index_type, value_type, nnz_type>(raft_handle,
                                               csr_m.row_offsets_,
                                               csr_m.col_indices_,
                                               csr_m.values_,
@@ -343,7 +343,7 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
   {
     vector_t<value_type> ones{raft_handle, (size_t)csr_m.nrows_};
     ones.fill(1.0);
-    sparse_matrix_t<index_type, value_type>::mv(1, ones.raw(), 0, diagonal_.raw());
+    sparse_matrix_t<index_type, value_type, nnz_type>::mv(1, ones.raw(), 0, diagonal_.raw());
   }
 
   // y = alpha*A*x + beta*y
@@ -357,9 +357,9 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
           bool symmetric      = false) const override
   {
     constexpr int BLOCK_SIZE = 1024;
-    auto n                   = sparse_matrix_t<index_type, value_type>::nrows_;
+    auto n                   = sparse_matrix_t<index_type, value_type, nnz_type>::nrows_;
 
-    auto handle   = sparse_matrix_t<index_type, value_type>::get_handle();
+    auto handle   = sparse_matrix_t<index_type, value_type, nnz_type>::get_handle();
     auto cublas_h = resource::get_cublas_handle(handle);
     auto stream   = resource::get_cuda_stream(handle);
 
@@ -382,31 +382,31 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type> {
 
     // Apply adjacency matrix
     //
-    sparse_matrix_t<index_type, value_type>::mv(-alpha, x, 1, y, alg, transpose, symmetric);
+    sparse_matrix_t<index_type, value_type, nnz_type>::mv(-alpha, x, 1, y, alg, transpose, symmetric);
   }
 
   vector_t<value_type> diagonal_;
 };
 
-template <typename index_type, typename value_type>
-struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
+template <typename index_type, typename value_type, typename nnz_type = uint64_t>
+struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type, nnz_type> {
   modularity_matrix_t(resources const& raft_handle,
                       index_type const* row_offsets,
                       index_type const* col_indices,
                       value_type const* values,
                       index_type const nrows,
-                      index_type const nnz)
-    : laplacian_matrix_t<index_type, value_type>(
+                      nnz_type const nnz)
+    : laplacian_matrix_t<index_type, value_type, nnz_type>(
         raft_handle, row_offsets, col_indices, values, nrows, nnz)
   {
-    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1();
+    edge_sum_ = laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.nrm1();
   }
 
   modularity_matrix_t(resources const& raft_handle,
                       sparse_matrix_t<index_type, value_type> const& csr_m)
-    : laplacian_matrix_t<index_type, value_type>(raft_handle, csr_m)
+    : laplacian_matrix_t<index_type, value_type, nnz_type>(raft_handle, csr_m)
   {
-    edge_sum_ = laplacian_matrix_t<index_type, value_type>::diagonal_.nrm1();
+    edge_sum_ = laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.nrm1();
   }
 
   // y = alpha*A*x + beta*y
@@ -427,7 +427,7 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
 
     // y = A*x
     //
-    sparse_matrix_t<index_type, value_type>::mv(alpha, x, 0, y, alg, transpose, symmetric);
+    sparse_matrix_t<index_type, value_type, nnz_type>::mv(alpha, x, 0, y, alg, transpose, symmetric);
     value_type dot_res;
 
     // gamma = d'*x
@@ -437,7 +437,7 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
     RAFT_CUBLAS_TRY(
       raft::linalg::detail::cublasdot(cublas_h,
                                       n,
-                                      laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
+                                      laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
                                       1,
                                       x,
                                       1,
@@ -452,7 +452,7 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type> {
       raft::linalg::detail::cublasaxpy(cublas_h,
                                        n,
                                        &gamma_,
-                                       laplacian_matrix_t<index_type, value_type>::diagonal_.raw(),
+                                       laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
                                        1,
                                        y,
                                        1,

From 485d0429264b07908a9d5c1a04670a7933abef96 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Thu, 30 Jan 2025 23:02:39 +0000
Subject: [PATCH 17/25] working through updates for cuvs

---
 .../raft/spectral/detail/matrix_wrappers.hpp  | 52 ++++++++++---------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index f7aa507059..6ac0f9c837 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -334,11 +334,11 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type, nnz_type> {
   laplacian_matrix_t(resources const& raft_handle,
                      sparse_matrix_t<index_type, value_type, nnz_type> const& csr_m)
     : sparse_matrix_t<index_type, value_type, nnz_type>(raft_handle,
-                                              csr_m.row_offsets_,
-                                              csr_m.col_indices_,
-                                              csr_m.values_,
-                                              csr_m.nrows_,
-                                              csr_m.nnz_),
+                                                        csr_m.row_offsets_,
+                                                        csr_m.col_indices_,
+                                                        csr_m.values_,
+                                                        csr_m.nrows_,
+                                                        csr_m.nnz_),
       diagonal_(raft_handle, csr_m.nrows_)
   {
     vector_t<value_type> ones{raft_handle, (size_t)csr_m.nrows_};
@@ -382,7 +382,8 @@ struct laplacian_matrix_t : sparse_matrix_t<index_type, value_type, nnz_type> {
 
     // Apply adjacency matrix
     //
-    sparse_matrix_t<index_type, value_type, nnz_type>::mv(-alpha, x, 1, y, alg, transpose, symmetric);
+    sparse_matrix_t<index_type, value_type, nnz_type>::mv(
+      -alpha, x, 1, y, alg, transpose, symmetric);
   }
 
   vector_t<value_type> diagonal_;
@@ -427,36 +428,37 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type, nnz_type
 
     // y = A*x
     //
-    sparse_matrix_t<index_type, value_type, nnz_type>::mv(alpha, x, 0, y, alg, transpose, symmetric);
+    sparse_matrix_t<index_type, value_type, nnz_type>::mv(
+      alpha, x, 0, y, alg, transpose, symmetric);
     value_type dot_res;
 
     // gamma = d'*x
     //
     // Cublas::dot(this->n, D.raw(), 1, x, 1, &dot_res);
     // TODO: Call from public API when ready
-    RAFT_CUBLAS_TRY(
-      raft::linalg::detail::cublasdot(cublas_h,
-                                      n,
-                                      laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
-                                      1,
-                                      x,
-                                      1,
-                                      &dot_res,
-                                      stream));
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasdot(
+      cublas_h,
+      n,
+      laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
+      1,
+      x,
+      1,
+      &dot_res,
+      stream));
 
     // y = y -(gamma/edge_sum)*d
     //
     value_type gamma_ = -dot_res / edge_sum_;
     // TODO: Call from public API when ready
-    RAFT_CUBLAS_TRY(
-      raft::linalg::detail::cublasaxpy(cublas_h,
-                                       n,
-                                       &gamma_,
-                                       laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
-                                       1,
-                                       y,
-                                       1,
-                                       stream));
+    RAFT_CUBLAS_TRY(raft::linalg::detail::cublasaxpy(
+      cublas_h,
+      n,
+      &gamma_,
+      laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.raw(),
+      1,
+      y,
+      1,
+      stream));
   }
 
   value_type edge_sum_;

From d377a2cff30ad80180d54535d49894ccc6d9eced Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Fri, 31 Jan 2025 00:12:49 +0000
Subject: [PATCH 18/25] lanczos tests updates

---
 .../raft/sparse/solver/detail/lanczos.cuh     | 16 +++++++-------
 cpp/include/raft/sparse/solver/lanczos.cuh    |  8 +++----
 cpp/tests/sparse/spectral_matrix.cu           | 21 ++++++++++++-------
 3 files changed, 25 insertions(+), 20 deletions(-)

diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index c9749624e1..3d1c52427c 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -789,10 +789,10 @@ static int lanczosRestart(raft::resources const& handle,
  *  @param seed random seed.
  *  @return error flag.
  */
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeSmallestEigenvectors(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const* A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
@@ -1034,10 +1034,10 @@ int computeSmallestEigenvectors(
   return 0;
 }
 
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeSmallestEigenvectors(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
@@ -1137,10 +1137,10 @@ int computeSmallestEigenvectors(
  *  @param seed random seed.
  *  @return error flag.
  */
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeLargestEigenvectors(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const* A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
@@ -1385,10 +1385,10 @@ int computeLargestEigenvectors(
   return 0;
 }
 
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeLargestEigenvectors(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
diff --git a/cpp/include/raft/sparse/solver/lanczos.cuh b/cpp/include/raft/sparse/solver/lanczos.cuh
index 4c45a28cc6..0617dc71da 100644
--- a/cpp/include/raft/sparse/solver/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/lanczos.cuh
@@ -137,10 +137,10 @@ auto lanczos_compute_smallest_eigenvectors(
  *  @param seed random seed.
  *  @return error flag.
  */
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeSmallestEigenvectors(
   raft::resources const& handle,
-  raft::spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+  raft::spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
@@ -201,10 +201,10 @@ int computeSmallestEigenvectors(
  *  @param seed random seed.
  *  @return error flag.
  */
-template <typename index_type_t, typename value_type_t>
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
 int computeLargestEigenvectors(
   raft::resources const& handle,
-  raft::spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+  raft::spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const& A,
   index_type_t nEigVecs,
   index_type_t maxIter,
   index_type_t restartIter,
diff --git a/cpp/tests/sparse/spectral_matrix.cu b/cpp/tests/sparse/spectral_matrix.cu
index c177ec236e..beb93bb425 100644
--- a/cpp/tests/sparse/spectral_matrix.cu
+++ b/cpp/tests/sparse/spectral_matrix.cu
@@ -41,8 +41,9 @@ TEST(Raft, SpectralMatrices)
 {
   using index_type = int;
   using value_type = double;
+  using nnz_type   = uint64_t
 
-  raft::resources h;
+    raft::resources h;
   ASSERT_EQ(0, raft::resource::get_device_id(h));
 
   csr_view_t<index_type, value_type> csr_v{nullptr, nullptr, nullptr, 0, 0};
@@ -53,29 +54,33 @@ TEST(Raft, SpectralMatrices)
   index_type* ro{nullptr};
   index_type* ci{nullptr};
   value_type* vs{nullptr};
-  index_type nnz   = 0;
+  nnz_type nnz     = 0;
   index_type nrows = 0;
-  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, (uint64_t)nnz};
-  sparse_matrix_t<index_type, value_type> sm2{h, csr_v};
+  sparse_matrix_t<index_type, value_type, nnz_type> sm1{h, ro, ci, vs, nrows, nnz};
+  sparse_matrix_t<index_type, value_type, nnz_type> sm2{h, csr_v};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
   ASSERT_EQ(nullptr, sm2.row_offsets_);
 
   auto stream = resource::get_cuda_stream(h);
 
   auto cnstr_lm1 = [&h, ro, ci, vs, nrows, nnz](void) {
-    laplacian_matrix_t<index_type, value_type> lm1{h, ro, ci, vs, nrows, nnz};
+    laplacian_matrix_t<index_type, value_type, nnz_type> lm1{h, ro, ci, vs, nrows, nnz};
   };
   EXPECT_ANY_THROW(cnstr_lm1());  // because of nullptr ptr args
 
-  auto cnstr_lm2 = [&h, &sm2](void) { laplacian_matrix_t<index_type, value_type> lm2{h, sm2}; };
+  auto cnstr_lm2 = [&h, &sm2](void) {
+    laplacian_matrix_t<index_type, value_type, nnz_type> lm2{h, sm2};
+  };
   EXPECT_ANY_THROW(cnstr_lm2());  // because of nullptr ptr args
 
   auto cnstr_mm1 = [&h, ro, ci, vs, nrows, nnz](void) {
-    modularity_matrix_t<index_type, value_type> mm1{h, ro, ci, vs, nrows, nnz};
+    modularity_matrix_t<index_type, value_type, nnz_type> mm1{h, ro, ci, vs, nrows, nnz};
   };
   EXPECT_ANY_THROW(cnstr_mm1());  // because of nullptr ptr args
 
-  auto cnstr_mm2 = [&h, &sm2](void) { modularity_matrix_t<index_type, value_type> mm2{h, sm2}; };
+  auto cnstr_mm2 = [&h, &sm2](void) {
+    modularity_matrix_t<index_type, value_type, nnz_type> mm2{h, sm2};
+  };
   EXPECT_ANY_THROW(cnstr_mm2());  // because of nullptr ptr args
 }
 

From 46db162c227c5b1ef525d36263b98e826cdcf339 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Fri, 31 Jan 2025 00:16:50 +0000
Subject: [PATCH 19/25] missing ;

---
 cpp/tests/sparse/spectral_matrix.cu | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/cpp/tests/sparse/spectral_matrix.cu b/cpp/tests/sparse/spectral_matrix.cu
index beb93bb425..6d52dfb1bb 100644
--- a/cpp/tests/sparse/spectral_matrix.cu
+++ b/cpp/tests/sparse/spectral_matrix.cu
@@ -41,9 +41,9 @@ TEST(Raft, SpectralMatrices)
 {
   using index_type = int;
   using value_type = double;
-  using nnz_type   = uint64_t
+  using nnz_type   = uint64_t;
 
-    raft::resources h;
+  raft::resources h;
   ASSERT_EQ(0, raft::resource::get_device_id(h));
 
   csr_view_t<index_type, value_type> csr_v{nullptr, nullptr, nullptr, 0, 0};

From dd39780ca0a49617def0d29138ab325ee986103d Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Fri, 31 Jan 2025 16:19:44 -0800
Subject: [PATCH 20/25] tons of updates to lanczos/eigen

---
 .../sparse/neighbors/detail/knn_graph.cuh     |   2 +-
 .../raft/sparse/solver/detail/lanczos.cuh     | 135 +++++++++---------
 .../raft/spectral/detail/matrix_wrappers.hpp  |   6 +-
 .../raft/spectral/detail/partition.hpp        |  16 ++-
 .../raft/spectral/detail/spectral_util.cuh    |  21 +--
 cpp/include/raft/spectral/eigen_solvers.cuh   |   4 +-
 cpp/include/raft/spectral/partition.cuh       |  16 ++-
 cpp/tests/linalg/eigen_solvers.cu             |  10 +-
 8 files changed, 111 insertions(+), 99 deletions(-)

diff --git a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
index 88391a8935..ba007c6bb1 100644
--- a/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
+++ b/cpp/include/raft/sparse/neighbors/detail/knn_graph.cuh
@@ -142,7 +142,7 @@ void knn_graph(raft::resources const& handle,
   conv_indices(int64_indices.data(), indices.data(), nnz, stream);
 
   raft::sparse::linalg::symmetrize(
-    handle, rows.data(), indices.data(), data.data(), m, k, nnz, out);
+    handle, rows.data(), indices.data(), data.data(), m, static_cast<int>(k), nnz, out);
 }
 
 };  // namespace raft::sparse::neighbors::detail
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index 3d1c52427c..1088211276 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -123,18 +123,19 @@ inline curandStatus_t curandGenerateNormalX(
  *    Workspace. Not needed if full reorthogonalization is disabled.
  *  @return Zero if successful. Otherwise non-zero.
  */
-template <typename index_type_t, typename value_type_t>
-int performLanczosIteration(raft::resources const& handle,
-                            spectral::matrix::sparse_matrix_t<index_type_t, value_type_t> const* A,
-                            index_type_t* iter,
-                            index_type_t maxIter,
-                            value_type_t shift,
-                            value_type_t tol,
-                            bool reorthogonalize,
-                            value_type_t* __restrict__ alpha_host,
-                            value_type_t* __restrict__ beta_host,
-                            value_type_t* __restrict__ lanczosVecs_dev,
-                            value_type_t* __restrict__ work_dev)
+template <typename index_type_t, typename value_type_t, typename nnz_type_t>
+int performLanczosIteration(
+  raft::resources const& handle,
+  spectral::matrix::sparse_matrix_t<index_type_t, value_type_t, nnz_type_t> const* A,
+  index_type_t* iter,
+  index_type_t maxIter,
+  value_type_t shift,
+  value_type_t tol,
+  bool reorthogonalize,
+  value_type_t* __restrict__ alpha_host,
+  value_type_t* __restrict__ beta_host,
+  value_type_t* __restrict__ lanczosVecs_dev,
+  value_type_t* __restrict__ work_dev)
 {
   // -------------------------------------------------------
   // Variable declaration
@@ -888,17 +889,17 @@ int computeSmallestEigenvectors(
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
   *shift   = 0;
-  status   = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                               A,
-                                                               effIter,
-                                                               maxIter_curr,
-                                                               *shift,
-                                                               0.0,
-                                                               reorthogonalize,
-                                                               alpha_host,
-                                                               beta_host,
-                                                               lanczosVecs_dev,
-                                                               work_dev);
+  status   = performLanczosIteration<index_type_t, value_type_t, nnz_type_t>(handle,
+                                                                           A,
+                                                                           effIter,
+                                                                           maxIter_curr,
+                                                                           *shift,
+                                                                           0.0,
+                                                                           reorthogonalize,
+                                                                           alpha_host,
+                                                                           beta_host,
+                                                                           lanczosVecs_dev,
+                                                                           work_dev);
   if (status) WARNING("error in Lanczos iteration");
 
   // Determine largest eigenvalue
@@ -913,17 +914,17 @@ int computeSmallestEigenvectors(
   // Obtain tridiagonal matrix with Lanczos
   *effIter = 0;
 
-  status = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                               A,
-                                                               effIter,
-                                                               maxIter_curr,
-                                                               *shift,
-                                                               0,
-                                                               reorthogonalize,
-                                                               alpha_host,
-                                                               beta_host,
-                                                               lanczosVecs_dev,
-                                                               work_dev);
+  status = performLanczosIteration<index_type_t, value_type_t, nnz_type_t>(handle,
+                                                                           A,
+                                                                           effIter,
+                                                                           maxIter_curr,
+                                                                           *shift,
+                                                                           0,
+                                                                           reorthogonalize,
+                                                                           alpha_host,
+                                                                           beta_host,
+                                                                           lanczosVecs_dev,
+                                                                           work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -961,17 +962,17 @@ int computeSmallestEigenvectors(
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                                 A,
-                                                                 effIter,
-                                                                 maxIter_curr,
-                                                                 *shift,
-                                                                 tol * fabs(shiftLower),
-                                                                 reorthogonalize,
-                                                                 alpha_host,
-                                                                 beta_host,
-                                                                 lanczosVecs_dev,
-                                                                 work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t, nnz_type_t>(handle,
+                                                                             A,
+                                                                             effIter,
+                                                                             maxIter_curr,
+                                                                             *shift,
+                                                                             tol * fabs(shiftLower),
+                                                                             reorthogonalize,
+                                                                             alpha_host,
+                                                                             beta_host,
+                                                                             lanczosVecs_dev,
+                                                                             work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
@@ -1236,17 +1237,17 @@ int computeLargestEigenvectors(
   value_type_t shift_val = 0.0;
   value_type_t* shift    = &shift_val;
 
-  status = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                               A,
-                                                               effIter,
-                                                               maxIter_curr,
-                                                               *shift,
-                                                               0,
-                                                               reorthogonalize,
-                                                               alpha_host,
-                                                               beta_host,
-                                                               lanczosVecs_dev,
-                                                               work_dev);
+  status = performLanczosIteration<index_type_t, value_type_t, nnz_type_t>(handle,
+                                                                           A,
+                                                                           effIter,
+                                                                           maxIter_curr,
+                                                                           *shift,
+                                                                           0,
+                                                                           reorthogonalize,
+                                                                           alpha_host,
+                                                                           beta_host,
+                                                                           lanczosVecs_dev,
+                                                                           work_dev);
   if (status) WARNING("error in Lanczos iteration");
   *totalIter += *effIter;
 
@@ -1284,17 +1285,17 @@ int computeLargestEigenvectors(
 
     // Proceed with Lanczos method
 
-    status = performLanczosIteration<index_type_t, value_type_t>(handle,
-                                                                 A,
-                                                                 effIter,
-                                                                 maxIter_curr,
-                                                                 *shift,
-                                                                 tol * fabs(shiftLower),
-                                                                 reorthogonalize,
-                                                                 alpha_host,
-                                                                 beta_host,
-                                                                 lanczosVecs_dev,
-                                                                 work_dev);
+    status = performLanczosIteration<index_type_t, value_type_t, nnz_type_t>(handle,
+                                                                             A,
+                                                                             effIter,
+                                                                             maxIter_curr,
+                                                                             *shift,
+                                                                             tol * fabs(shiftLower),
+                                                                             reorthogonalize,
+                                                                             alpha_host,
+                                                                             beta_host,
+                                                                             lanczosVecs_dev,
+                                                                             work_dev);
     if (status) WARNING("error in Lanczos iteration");
     *totalIter += *effIter - iter_new;
   }
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
index 6ac0f9c837..af4a838bd5 100644
--- a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
+++ b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -404,7 +404,7 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type, nnz_type
   }
 
   modularity_matrix_t(resources const& raft_handle,
-                      sparse_matrix_t<index_type, value_type> const& csr_m)
+                      sparse_matrix_t<index_type, value_type, nnz_type> const& csr_m)
     : laplacian_matrix_t<index_type, value_type, nnz_type>(raft_handle, csr_m)
   {
     edge_sum_ = laplacian_matrix_t<index_type, value_type, nnz_type>::diagonal_.nrm1();
@@ -420,9 +420,9 @@ struct modularity_matrix_t : laplacian_matrix_t<index_type, value_type, nnz_type
           bool transpose      = false,
           bool symmetric      = false) const override
   {
-    auto n = sparse_matrix_t<index_type, value_type>::nrows_;
+    auto n = sparse_matrix_t<index_type, value_type, nnz_type>::nrows_;
 
-    auto handle   = sparse_matrix_t<index_type, value_type>::get_handle();
+    auto handle   = sparse_matrix_t<index_type, value_type, nnz_type>::get_handle();
     auto cublas_h = resource::get_cublas_handle(handle);
     auto stream   = resource::get_cuda_stream(handle);
 
diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp
index f5fc40aad6..26e4a73f9d 100644
--- a/cpp/include/raft/spectral/detail/partition.hpp
+++ b/cpp/include/raft/spectral/detail/partition.hpp
@@ -63,10 +63,14 @@ namespace detail {
  *    performed.
  *  @return statistics: number of eigensolver iterations, .
  */
-template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+template <typename vertex_t,
+          typename weight_t,
+          typename nnz_t,
+          typename EigenSolver,
+          typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> partition(
   raft::resources const& handle,
-  spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  spectral::matrix::sparse_matrix_t<vertex_t, weight_t, nnz_t> const& csr_m,
   EigenSolver const& eigen_solver,
   ClusterSolver const& cluster_solver,
   vertex_t* __restrict__ clusters,
@@ -94,7 +98,7 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
 
   // Initialize Laplacian
   /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
-  spectral::matrix::laplacian_matrix_t<vertex_t, weight_t> L{handle, csr_m};
+  spectral::matrix::laplacian_matrix_t<vertex_t, weight_t, nnz_t> L{handle, csr_m};
 
   auto eigen_config = eigen_solver.get_config();
   auto nEigVecs     = eigen_config.n_eigVecs;
@@ -132,9 +136,9 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
  *  @param cost On exit, partition cost function.
  *  @return error flag.
  */
-template <typename vertex_t, typename weight_t>
+template <typename vertex_t, typename weight_t, typename nnz_t>
 void analyzePartition(raft::resources const& handle,
-                      spectral::matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                      spectral::matrix::sparse_matrix_t<vertex_t, weight_t, nnz_t> const& csr_m,
                       vertex_t nClusters,
                       const vertex_t* __restrict__ clusters,
                       weight_t& edgeCut,
@@ -160,7 +164,7 @@ void analyzePartition(raft::resources const& handle,
 
   // Initialize Laplacian
   /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
-  spectral::matrix::laplacian_matrix_t<vertex_t, weight_t> L{handle, csr_m};
+  spectral::matrix::laplacian_matrix_t<vertex_t, weight_t, nnz_t> L{handle, csr_m};
 
   // Initialize output
   cost    = 0;
diff --git a/cpp/include/raft/spectral/detail/spectral_util.cuh b/cpp/include/raft/spectral/detail/spectral_util.cuh
index 002fad9680..9bbc8878fe 100644
--- a/cpp/include/raft/spectral/detail/spectral_util.cuh
+++ b/cpp/include/raft/spectral/detail/spectral_util.cuh
@@ -133,16 +133,17 @@ struct equal_to_i_op {
 
 // Construct indicator vector for ith partition
 //
-template <typename vertex_t, typename edge_t, typename weight_t>
-bool construct_indicator(raft::resources const& handle,
-                         edge_t index,
-                         edge_t n,
-                         weight_t& clustersize,
-                         weight_t& partStats,
-                         vertex_t const* __restrict__ clusters,
-                         raft::spectral::matrix::vector_t<weight_t>& part_i,
-                         raft::spectral::matrix::vector_t<weight_t>& Bx,
-                         raft::spectral::matrix::laplacian_matrix_t<vertex_t, weight_t> const& B)
+template <typename vertex_t, typename edge_t, typename weight_t, typename nnz_t>
+bool construct_indicator(
+  raft::resources const& handle,
+  edge_t index,
+  edge_t n,
+  weight_t& clustersize,
+  weight_t& partStats,
+  vertex_t const* __restrict__ clusters,
+  raft::spectral::matrix::vector_t<weight_t>& part_i,
+  raft::spectral::matrix::vector_t<weight_t>& Bx,
+  raft::spectral::matrix::laplacian_matrix_t<vertex_t, weight_t, nnz_t> const& B)
 {
   auto stream             = resource::get_cuda_stream(handle);
   auto cublas_h           = resource::get_cublas_handle(handle);
diff --git a/cpp/include/raft/spectral/eigen_solvers.cuh b/cpp/include/raft/spectral/eigen_solvers.cuh
index 324f16ac7b..03448e2b5e 100644
--- a/cpp/include/raft/spectral/eigen_solvers.cuh
+++ b/cpp/include/raft/spectral/eigen_solvers.cuh
@@ -51,7 +51,7 @@ struct lanczos_solver_t {
 
   index_type_t solve_smallest_eigenvectors(
     raft::resources const& handle,
-    matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+    matrix::sparse_matrix_t<index_type_t, value_type_t, size_type_t> const& A,
     value_type_t* __restrict__ eigVals,
     value_type_t* __restrict__ eigVecs) const
   {
@@ -75,7 +75,7 @@ struct lanczos_solver_t {
 
   index_type_t solve_largest_eigenvectors(
     raft::resources const& handle,
-    matrix::sparse_matrix_t<index_type_t, value_type_t> const& A,
+    matrix::sparse_matrix_t<index_type_t, value_type_t, size_type_t> const& A,
     value_type_t* __restrict__ eigVals,
     value_type_t* __restrict__ eigVecs) const
   {
diff --git a/cpp/include/raft/spectral/partition.cuh b/cpp/include/raft/spectral/partition.cuh
index a2ac328aa1..319ef0ccd1 100644
--- a/cpp/include/raft/spectral/partition.cuh
+++ b/cpp/include/raft/spectral/partition.cuh
@@ -45,17 +45,21 @@ namespace spectral {
  *  @param eigVecs Output eigenvector array pointer on device
  *  @return statistics: number of eigensolver iterations, .
  */
-template <typename vertex_t, typename weight_t, typename EigenSolver, typename ClusterSolver>
+template <typename vertex_t,
+          typename weight_t,
+          typename nnz_t,
+          typename EigenSolver,
+          typename ClusterSolver>
 std::tuple<vertex_t, weight_t, vertex_t> partition(
   raft::resources const& handle,
-  matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+  matrix::sparse_matrix_t<vertex_t, weight_t, nnz_t> const& csr_m,
   EigenSolver const& eigen_solver,
   ClusterSolver const& cluster_solver,
   vertex_t* __restrict__ clusters,
   weight_t* eigVals,
   weight_t* eigVecs)
 {
-  return raft::spectral::detail::partition<vertex_t, weight_t, EigenSolver, ClusterSolver>(
+  return raft::spectral::detail::partition<vertex_t, weight_t, nnz_t, EigenSolver, ClusterSolver>(
     handle, csr_m, eigen_solver, cluster_solver, clusters, eigVals, eigVecs);
 }
 
@@ -77,15 +81,15 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
  *  @param edgeCut On exit, weight of edges cut by partition.
  *  @param cost On exit, partition cost function.
  */
-template <typename vertex_t, typename weight_t>
+template <typename vertex_t, typename weight_t, typename nnz_t>
 void analyzePartition(raft::resources const& handle,
-                      matrix::sparse_matrix_t<vertex_t, weight_t> const& csr_m,
+                      matrix::sparse_matrix_t<vertex_t, weight_t, nnz_t> const& csr_m,
                       vertex_t nClusters,
                       const vertex_t* __restrict__ clusters,
                       weight_t& edgeCut,
                       weight_t& cost)
 {
-  raft::spectral::detail::analyzePartition<vertex_t, weight_t>(
+  raft::spectral::detail::analyzePartition<vertex_t, weight_t, nnz_t>(
     handle, csr_m, nClusters, clusters, edgeCut, cost);
 }
 
diff --git a/cpp/tests/linalg/eigen_solvers.cu b/cpp/tests/linalg/eigen_solvers.cu
index a7eec5561c..250deb6f33 100644
--- a/cpp/tests/linalg/eigen_solvers.cu
+++ b/cpp/tests/linalg/eigen_solvers.cu
@@ -36,6 +36,7 @@ TEST(Raft, EigenSolvers)
   using namespace matrix;
   using index_type = int;
   using value_type = double;
+  using nnz_type   = int;
 
   raft::resources h;
   ASSERT_EQ(0, resource::get_device_id(h));
@@ -46,7 +47,7 @@ TEST(Raft, EigenSolvers)
   index_type nnz   = 0;
   index_type nrows = 0;
 
-  sparse_matrix_t<index_type, value_type> sm1{h, ro, ci, vs, nrows, (uint64_t)nnz};
+  sparse_matrix_t<index_type, value_type, nnz_type> sm1{h, ro, ci, vs, nrows, nnz};
   ASSERT_EQ(nullptr, sm1.row_offsets_);
 
   index_type neigvs{10};
@@ -64,7 +65,7 @@ TEST(Raft, EigenSolvers)
   eigen_solver_config_t<index_type, value_type> cfg{
     neigvs, maxiter, restart_iter, tol, reorthog, seed};
 
-  lanczos_solver_t<index_type, value_type> eig_solver{cfg};
+  lanczos_solver_t<index_type, value_type, nnz_type> eig_solver{cfg};
 
   EXPECT_ANY_THROW(eig_solver.solve_smallest_eigenvectors(h, sm1, eigvals, eigvecs));
 
@@ -77,6 +78,7 @@ TEST(Raft, SpectralSolvers)
   using namespace matrix;
   using index_type = int;
   using value_type = double;
+  using nnz_type   = int;
 
   raft::resources h;
   ASSERT_EQ(0, resource::get_device_id(h)
@@ -99,14 +101,14 @@ TEST(Raft, SpectralSolvers)
 
   eigen_solver_config_t<index_type, value_type> eig_cfg{
     neigvs, maxiter, restart_iter, tol, reorthog, seed};
-  lanczos_solver_t<index_type, value_type> eig_solver{eig_cfg};
+  lanczos_solver_t<index_type, value_type, nnz_type> eig_solver{eig_cfg};
 
   index_type k{5};
 
   cluster_solver_config_t<index_type, value_type> clust_cfg{k, maxiter, tol, seed};
   kmeans_solver_t<index_type, value_type> cluster_solver{clust_cfg};
 
-  sparse_matrix_t<index_type, value_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
+  sparse_matrix_t<index_type, value_type, nnz_type> sm{h, nullptr, nullptr, nullptr, 0, 0};
   EXPECT_ANY_THROW(
     spectral::partition(h, sm, eig_solver, cluster_solver, clusters, eigvals, eigvecs));
 

From f17e921bf038328adb8602bf3e790d05cd115ce9 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 3 Feb 2025 20:06:48 +0000
Subject: [PATCH 21/25] some fixes for cuml

---
 .../raft/cluster/detail/connectivities.cuh    | 20 +++++++++----------
 cpp/include/raft/cluster/single_linkage.cuh   |  8 ++++----
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/cpp/include/raft/cluster/detail/connectivities.cuh b/cpp/include/raft/cluster/detail/connectivities.cuh
index c527b754c3..86bae07711 100644
--- a/cpp/include/raft/cluster/detail/connectivities.cuh
+++ b/cpp/include/raft/cluster/detail/connectivities.cuh
@@ -43,8 +43,8 @@ template <raft::cluster::LinkageDistance dist_type, typename value_idx, typename
 struct distance_graph_impl {
   void run(raft::resources const& handle,
            const value_t* X,
-           size_t m,
-           size_t n,
+           value_idx m,
+           value_idx n,
            raft::distance::DistanceType metric,
            rmm::device_uvector<value_idx>& indptr,
            rmm::device_uvector<value_idx>& indices,
@@ -61,8 +61,8 @@ template <typename value_idx, typename value_t>
 struct distance_graph_impl<raft::cluster::LinkageDistance::KNN_GRAPH, value_idx, value_t> {
   void run(raft::resources const& handle,
            const value_t* X,
-           size_t m,
-           size_t n,
+           value_idx m,
+           value_idx n,
            raft::distance::DistanceType metric,
            rmm::device_uvector<value_idx>& indptr,
            rmm::device_uvector<value_idx>& indices,
@@ -130,8 +130,8 @@ RAFT_KERNEL fill_indices2(value_idx* indices, size_t m, size_t nnz)
 template <typename value_idx, typename value_t>
 void pairwise_distances(const raft::resources& handle,
                         const value_t* X,
-                        size_t m,
-                        size_t n,
+                        value_idx m,
+                        value_idx n,
                         raft::distance::DistanceType metric,
                         value_idx* indptr,
                         value_idx* indices,
@@ -178,8 +178,8 @@ template <typename value_idx, typename value_t>
 struct distance_graph_impl<raft::cluster::LinkageDistance::PAIRWISE, value_idx, value_t> {
   void run(const raft::resources& handle,
            const value_t* X,
-           size_t m,
-           size_t n,
+           value_idx m,
+           value_idx n,
            raft::distance::DistanceType metric,
            rmm::device_uvector<value_idx>& indptr,
            rmm::device_uvector<value_idx>& indices,
@@ -216,8 +216,8 @@ struct distance_graph_impl<raft::cluster::LinkageDistance::PAIRWISE, value_idx,
 template <typename value_idx, typename value_t, raft::cluster::LinkageDistance dist_type>
 void get_distance_graph(raft::resources const& handle,
                         const value_t* X,
-                        size_t m,
-                        size_t n,
+                        value_idx m,
+                        value_idx n,
                         raft::distance::DistanceType metric,
                         rmm::device_uvector<value_idx>& indptr,
                         rmm::device_uvector<value_idx>& indices,
diff --git a/cpp/include/raft/cluster/single_linkage.cuh b/cpp/include/raft/cluster/single_linkage.cuh
index 067445c542..de56386b96 100644
--- a/cpp/include/raft/cluster/single_linkage.cuh
+++ b/cpp/include/raft/cluster/single_linkage.cuh
@@ -52,8 +52,8 @@ template <typename value_idx,
           LinkageDistance dist_type = LinkageDistance::KNN_GRAPH>
 [[deprecated("Use cuVS instead")]] void single_linkage(raft::resources const& handle,
                                                        const value_t* X,
-                                                       size_t m,
-                                                       size_t n,
+                                                       value_idx m,
+                                                       value_idx n,
                                                        raft::distance::DistanceType metric,
                                                        linkage_output<value_idx>* out,
                                                        int c,
@@ -103,8 +103,8 @@ template <typename value_t, typename idx_t, LinkageDistance dist_type = LinkageD
   raft::cluster::single_linkage<idx_t, value_t, dist_type>(
     handle,
     X.data_handle(),
-    static_cast<std::size_t>(X.extent(0)),
-    static_cast<std::size_t>(X.extent(1)),
+    X.extent(0),
+    X.extent(1),
     metric,
     &out_arrs,
     c.has_value() ? c.value() : DEFAULT_CONST_C,

From 5b30389da28d01d995b7281315076cb98869b531 Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 3 Feb 2025 22:12:33 +0000
Subject: [PATCH 22/25] fixes for cuml

---
 cpp/include/raft/sparse/linalg/detail/spectral.cuh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index 87dfa045fe..a2d50f9a77 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -71,7 +71,7 @@ void fit_embedding(raft::resources const& handle,
   index_type* ci = dst_cols.data();
   value_type* vs = dst_vals.data();
 
-  raft::spectral::matrix::sparse_matrix_t<index_type, value_type> const r_csr_m{
+  raft::spectral::matrix::sparse_matrix_t<index_type, value_type, nnz_t> const r_csr_m{
     handle, ro, ci, vs, static_cast<index_type>(n), static_cast<uint64_t>(nnz)};
 
   index_type neigvs       = n_components + 1;
@@ -79,12 +79,12 @@ void fit_embedding(raft::resources const& handle,
   value_type tol          = 0.01;
   index_type restart_iter = 15 + neigvs;  // what cugraph is using
 
-  raft::spectral::eigen_solver_config_t<index_type, value_type> cfg{
+  raft::spectral::eigen_solver_config_t<index_type, value_type, nnz_t> cfg{
     neigvs, maxiter, restart_iter, tol};
 
   cfg.seed = seed;
 
-  raft::spectral::lanczos_solver_t<index_type, value_type> eig_solver{cfg};
+  raft::spectral::lanczos_solver_t<index_type, value_type, nnz_t> eig_solver{cfg};
 
   // cluster computation here is irrelevant,
   // hence define a no-op such solver to

From a444f900e06e24b5a2aa723be7b95a096723908b Mon Sep 17 00:00:00 2001
From: divyegala <divyegala@gmail.com>
Date: Mon, 3 Feb 2025 23:43:54 +0000
Subject: [PATCH 23/25] more cuml fixes

---
 cpp/include/raft/sparse/linalg/detail/spectral.cuh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cpp/include/raft/sparse/linalg/detail/spectral.cuh b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
index a2d50f9a77..7b8cb545cf 100644
--- a/cpp/include/raft/sparse/linalg/detail/spectral.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -72,7 +72,7 @@ void fit_embedding(raft::resources const& handle,
   value_type* vs = dst_vals.data();
 
   raft::spectral::matrix::sparse_matrix_t<index_type, value_type, nnz_t> const r_csr_m{
-    handle, ro, ci, vs, static_cast<index_type>(n), static_cast<uint64_t>(nnz)};
+    handle, ro, ci, vs, static_cast<index_type>(n), nnz};
 
   index_type neigvs       = n_components + 1;
   index_type maxiter      = 4000;  // default reset value (when set to 0);

From 0ad7bacf4a8f4f57e06ee8061be08fce32f9673e Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Tue, 11 Feb 2025 12:17:37 +0000
Subject: [PATCH 24/25] Answering review

---
 cpp/include/raft/linalg/detail/lanczos.cuh           |  2 +-
 cpp/include/raft/sparse/linalg/degree.cuh            |  2 +-
 cpp/include/raft/sparse/linalg/detail/degree.cuh     | 12 ++++++------
 cpp/include/raft/sparse/linalg/detail/symmetrize.cuh | 10 +++++-----
 cpp/include/raft/sparse/op/detail/filter.cuh         |  8 ++++----
 cpp/include/raft/sparse/op/detail/sort.h             |  2 +-
 cpp/include/raft/sparse/solver/detail/lanczos.cuh    | 10 +++++-----
 7 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/cpp/include/raft/linalg/detail/lanczos.cuh b/cpp/include/raft/linalg/detail/lanczos.cuh
index 33b9faa27d..06c3cb1357 100644
--- a/cpp/include/raft/linalg/detail/lanczos.cuh
+++ b/cpp/include/raft/linalg/detail/lanczos.cuh
@@ -1004,7 +1004,7 @@ int computeSmallestEigenvectors(
   index_type_t n = A.nrows_;
 
   // Check that parameters are valid
-  RAFT_EXPECTS(nEigVecs > 0 && static_cast<uint64_t>(nEigVecs) <= n,
+  RAFT_EXPECTS(nEigVecs > 0 && static_cast<nnz_type_t>(nEigVecs) <= n,
                "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
diff --git a/cpp/include/raft/sparse/linalg/degree.cuh b/cpp/include/raft/sparse/linalg/degree.cuh
index 9ce7321de0..dde811ee2d 100644
--- a/cpp/include/raft/sparse/linalg/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/degree.cuh
@@ -68,7 +68,7 @@ template <typename T, typename nnz_type, typename outT>
 void coo_degree_scalar(
   const int* rows, const T* vals, nnz_type nnz, T scalar, outT* results, cudaStream_t stream = 0)
 {
-  detail::coo_degree_scalar<64>(rows, vals, (uint64_t)nnz, scalar, results, stream);
+  detail::coo_degree_scalar<64>(rows, vals, nnz, scalar, results, stream);
 }
 
 /**
diff --git a/cpp/include/raft/sparse/linalg/detail/degree.cuh b/cpp/include/raft/sparse/linalg/detail/degree.cuh
index 6a83ddae42..d51188c54c 100644
--- a/cpp/include/raft/sparse/linalg/detail/degree.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/degree.cuh
@@ -39,10 +39,10 @@ namespace detail {
  * @param nnz the size of the rows array
  * @param results array to place results
  */
-template <uint64_t TPB_X = 64, typename T = int, typename outT, typename nnz_t>
+template <int TPB_X = 64, typename T = int, typename outT, typename nnz_t>
 RAFT_KERNEL coo_degree_kernel(const T* rows, nnz_t nnz, outT* results)
 {
-  uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
+  nnz_t row = (blockIdx.x * static_cast<nnz_t>(TPB_X)) + threadIdx.x;
   if (row < nnz) { atomicAdd(results + rows[row], (outT)1); }
 }
 
@@ -71,11 +71,11 @@ RAFT_KERNEL coo_degree_nz_kernel(const int* rows, const T* vals, nnz_t nnz, int*
   if (row < nnz && vals[row] != 0.0) { raft::myAtomicAdd(results + rows[row], 1); }
 }
 
-template <uint64_t TPB_X = 64, typename T, typename outT, typename nnz_t>
+template <int TPB_X = 64, typename T, typename outT, typename nnz_t>
 RAFT_KERNEL coo_degree_scalar_kernel(
   const int* rows, const T* vals, nnz_t nnz, T scalar, outT* results)
 {
-  uint64_t row = (blockIdx.x * TPB_X) + threadIdx.x;
+  nnz_t row = (blockIdx.x * static_cast<nnz_t>(TPB_X)) + threadIdx.x;
   if (row < nnz && vals[row] != scalar) { raft::myAtomicAdd((outT*)results + rows[row], (outT)1); }
 }
 
@@ -90,11 +90,11 @@ RAFT_KERNEL coo_degree_scalar_kernel(
  * @param results: output row counts
  * @param stream: cuda stream to use
  */
-template <uint64_t TPB_X = 64, typename T, typename outT, typename nnz_t>
+template <int TPB_X = 64, typename T, typename outT, typename nnz_t>
 void coo_degree_scalar(
   const int* rows, const T* vals, nnz_t nnz, T scalar, outT* results, cudaStream_t stream = 0)
 {
-  dim3 grid_rc(raft::ceildiv(nnz, TPB_X), 1, 1);
+  dim3 grid_rc(raft::ceildiv(nnz, static_cast<nnz_t>(TPB_X)), 1, 1);
   dim3 blk_rc(TPB_X, 1, 1);
   coo_degree_scalar_kernel<TPB_X, T>
     <<<grid_rc, blk_rc, 0, stream>>>(rows, vals, nnz, scalar, results);
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index 3b76f0cc4f..686c601e98 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -104,7 +104,7 @@ RAFT_KERNEL coo_symmetrize_kernel(nnz_t* row_ind,
       // Note that if we did find a match, we don't need to
       // compute `res` on it here because it will be computed
       // in a different thread.
-      if (!found_match && cur_val != 0.0) {
+      if (!found_match && vals[idx] != 0.0) {
         orows[out_start_idx + row_nnz] = cur_col;
         ocols[out_start_idx + row_nnz] = cur_row;
         ovals[out_start_idx + row_nnz] = res;
@@ -131,9 +131,9 @@ RAFT_KERNEL coo_symmetrize_kernel(nnz_t* row_ind,
  * @param reduction_op: a custom reduction function
  * @param stream: cuda stream to use
  */
-template <int TPB_X = 128, typename T, typename Lambda>
-void coo_symmetrize(COO<T>* in,
-                    COO<T>* out,
+template <int TPB_X = 128, typename T, typename IdxT, typename nnz_t, typename Lambda>
+void coo_symmetrize(COO<T, IdxT, nnz_t>* in,
+                    COO<T, IdxT, nnz_t>* out,
                     Lambda reduction_op,  // two-argument reducer
                     cudaStream_t stream)
 {
@@ -142,7 +142,7 @@ void coo_symmetrize(COO<T>* in,
 
   ASSERT(!out->validate_mem(), "Expecting unallocated COO for output");
 
-  rmm::device_uvector<uint64_t> in_row_ind(in->n_rows, stream);
+  rmm::device_uvector<nnz_t> in_row_ind(in->n_rows, stream);
 
   convert::sorted_coo_to_csr(in, in_row_ind.data(), stream);
 
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index c5c095462d..dede0747a7 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -104,10 +104,10 @@ void coo_remove_scalar(const idx_t* rows,
                        idx_t n,
                        cudaStream_t stream)
 {
-  rmm::device_uvector<uint64_t> ex_scan(n, stream);
-  rmm::device_uvector<uint64_t> cur_ex_scan(n, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (nnz_t)n * sizeof(uint64_t), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (nnz_t)n * sizeof(uint64_t), stream));
+  rmm::device_uvector<nnz_t> ex_scan(n, stream);
+  rmm::device_uvector<nnz_t> cur_ex_scan(n, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (nnz_t)n * sizeof(nnz_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (nnz_t)n * sizeof(nnz_t), stream));
 
   thrust::device_ptr<nnz_t> dev_cnnz    = thrust::device_pointer_cast(cnnz);
   thrust::device_ptr<nnz_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
diff --git a/cpp/include/raft/sparse/op/detail/sort.h b/cpp/include/raft/sparse/op/detail/sort.h
index 8bd848bedb..2c5337bf0e 100644
--- a/cpp/include/raft/sparse/op/detail/sort.h
+++ b/cpp/include/raft/sparse/op/detail/sort.h
@@ -83,7 +83,7 @@ void coo_sort(IdxT m, IdxT n, nnz_t nnz, IdxT* rows, IdxT* cols, T* vals, cudaSt
  * @param in: COO to sort by row
  * @param stream: the cuda stream to use
  */
-template <typename T, typename IdxT = int, typename nnz_t = uint64_t>
+template <typename T, typename IdxT = int, typename nnz_t>
 void coo_sort(COO<T, IdxT, nnz_t>* const in, cudaStream_t stream)
 {
   coo_sort<T, IdxT, nnz_t>(
diff --git a/cpp/include/raft/sparse/solver/detail/lanczos.cuh b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
index 1088211276..71274333e8 100644
--- a/cpp/include/raft/sparse/solver/detail/lanczos.cuh
+++ b/cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -152,7 +152,7 @@ int performLanczosIteration(
 
   RAFT_EXPECTS(A != nullptr, "Null matrix pointer.");
 
-  uint64_t n = A->nrows_;
+  nnz_type_t n = A->nrows_;
 
   // -------------------------------------------------------
   // Compute second Lanczos vector
@@ -815,7 +815,7 @@ int computeSmallestEigenvectors(
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
-  uint64_t n = A->nrows_;
+  nnz_type_t n = A->nrows_;
 
   // Shift for implicit restart
   value_type_t shiftUpper;
@@ -837,7 +837,7 @@ int computeSmallestEigenvectors(
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && static_cast<uint64_t>(nEigVecs) <= n,
+  RAFT_EXPECTS(nEigVecs > 0 && static_cast<nnz_type_t>(nEigVecs) <= n,
                "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");
@@ -1162,7 +1162,7 @@ int computeLargestEigenvectors(
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
-  uint64_t n = A->nrows_;
+  nnz_type_t n = A->nrows_;
 
   // Lanczos iteration counters
   index_type_t maxIter_curr = restartIter;  // Maximum size of Lanczos system
@@ -1185,7 +1185,7 @@ int computeLargestEigenvectors(
   // -------------------------------------------------------
   // Check that parameters are valid
   // -------------------------------------------------------
-  RAFT_EXPECTS(nEigVecs > 0 && static_cast<uint64_t>(nEigVecs) <= n,
+  RAFT_EXPECTS(nEigVecs > 0 && static_cast<nnz_type_t>(nEigVecs) <= n,
                "Invalid number of eigenvectors.");
   RAFT_EXPECTS(restartIter > 0, "Invalid restartIter.");
   RAFT_EXPECTS(tol > 0, "Invalid tolerance.");

From f6f7908084819cc1310f3f7cb4d53793778e9f18 Mon Sep 17 00:00:00 2001
From: viclafargue <viclafargue@nvidia.com>
Date: Tue, 11 Feb 2025 12:39:45 +0000
Subject: [PATCH 25/25] Additional changes

---
 cpp/include/raft/sparse/convert/detail/csr.cuh       |  7 ++++---
 cpp/include/raft/sparse/linalg/detail/symmetrize.cuh |  2 +-
 cpp/include/raft/sparse/op/detail/filter.cuh         | 11 +++++++----
 3 files changed, 12 insertions(+), 8 deletions(-)

diff --git a/cpp/include/raft/sparse/convert/detail/csr.cuh b/cpp/include/raft/sparse/convert/detail/csr.cuh
index 44d6f21f9d..64ed1bbeea 100644
--- a/cpp/include/raft/sparse/convert/detail/csr.cuh
+++ b/cpp/include/raft/sparse/convert/detail/csr.cuh
@@ -84,11 +84,12 @@ void coo_to_csr(raft::resources const& handle,
  * @param m: number of rows in dense matrix
  * @param stream: cuda stream to use
  */
-template <typename T, typename outT>
-void sorted_coo_to_csr(const T* rows, uint64_t nnz, outT* row_ind, int m, cudaStream_t stream)
+template <typename T, typename nnz_t, typename outT>
+void sorted_coo_to_csr(const T* rows, nnz_t nnz, outT* row_ind, int m, cudaStream_t stream)
 {
   rmm::device_uvector<outT> row_counts(m, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_counts.data(), 0, (uint64_t)m * sizeof(outT), stream));
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(row_counts.data(), 0, static_cast<nnz_t>(m) * sizeof(outT), stream));
 
   linalg::coo_degree(rows, nnz, row_counts.data(), stream);
 
diff --git a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
index 686c601e98..b248de855d 100644
--- a/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
+++ b/cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -352,7 +352,7 @@ void symmetrize(raft::resources const& handle,
   // sort COO
   raft::sparse::op::coo_sort((value_idx)m,
                              (value_idx)n,
-                             (nnz_t)nnz * 2,
+                             static_cast<nnz_t>(nnz) * 2,
                              symm_rows.data(),
                              symm_cols.data(),
                              symm_vals.data(),
diff --git a/cpp/include/raft/sparse/op/detail/filter.cuh b/cpp/include/raft/sparse/op/detail/filter.cuh
index dede0747a7..db2e3b858b 100644
--- a/cpp/include/raft/sparse/op/detail/filter.cuh
+++ b/cpp/include/raft/sparse/op/detail/filter.cuh
@@ -106,8 +106,9 @@ void coo_remove_scalar(const idx_t* rows,
 {
   rmm::device_uvector<nnz_t> ex_scan(n, stream);
   rmm::device_uvector<nnz_t> cur_ex_scan(n, stream);
-  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (nnz_t)n * sizeof(nnz_t), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (nnz_t)n * sizeof(nnz_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, static_cast<nnz_t>(n) * sizeof(nnz_t), stream));
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(cur_ex_scan.data(), 0, static_cast<nnz_t>(n) * sizeof(nnz_t), stream));
 
   thrust::device_ptr<nnz_t> dev_cnnz    = thrust::device_pointer_cast(cnnz);
   thrust::device_ptr<nnz_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
@@ -153,8 +154,10 @@ void coo_remove_scalar(COO<T, idx_t, nnz_t>* in,
   rmm::device_uvector<nnz_t> row_count_nz(in->n_rows, stream);
   rmm::device_uvector<nnz_t> row_count(in->n_rows, stream);
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (nnz_t)in->n_rows * sizeof(nnz_t), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (nnz_t)in->n_rows * sizeof(nnz_t), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(
+    row_count_nz.data(), 0, static_cast<nnz_t>(in->n_rows) * sizeof(nnz_t), stream));
+  RAFT_CUDA_TRY(
+    cudaMemsetAsync(row_count.data(), 0, static_cast<nnz_t>(in->n_rows) * sizeof(nnz_t), stream));
 
   linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());