Commit dc1ca34

progress so far

viclafargue committed Jan 16, 2025
1 parent 2e06ab9 commit dc1ca34
Showing 6 changed files with 68 additions and 61 deletions.
12 changes: 6 additions & 6 deletions cpp/include/raft/linalg/detail/transpose.cuh
@@ -34,7 +34,7 @@ namespace raft {
 namespace linalg {
 namespace detail {
 
-template <typename IndexType, int TILE_DIM, int BLOCK_ROWS>
+template <typename IndexType, uint64_t TILE_DIM, uint64_t BLOCK_ROWS>
 RAFT_KERNEL transpose_half_kernel(IndexType n_rows,
                                   IndexType n_cols,
                                   const half* __restrict__ in,
@@ -44,10 +44,10 @@ RAFT_KERNEL transpose_half_kernel(IndexType n_rows,
 {
   __shared__ half tile[TILE_DIM][TILE_DIM + 1];
 
-  for (int block_offset_y = 0; block_offset_y < n_rows; block_offset_y += gridDim.y * TILE_DIM) {
-    for (int block_offset_x = 0; block_offset_x < n_cols; block_offset_x += gridDim.x * TILE_DIM) {
-      auto x = block_offset_x + blockIdx.x * TILE_DIM + threadIdx.x;
-      auto y = block_offset_y + blockIdx.y * TILE_DIM + threadIdx.y;
+  for (uint64_t block_offset_y = 0; block_offset_y < n_rows; block_offset_y += gridDim.y * TILE_DIM) {
+    for (uint64_t block_offset_x = 0; block_offset_x < n_cols; block_offset_x += gridDim.x * TILE_DIM) {
+      uint64_t x = block_offset_x + blockIdx.x * TILE_DIM + threadIdx.x;
+      uint64_t y = block_offset_y + blockIdx.y * TILE_DIM + threadIdx.y;
 
       for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
         if (x < n_cols && (y + j) < n_rows) {
@@ -117,7 +117,7 @@ void transpose_half(raft::resources const& handle,
   int num_blocks = max_active_blocks * sm_count;
 
   int grid_x = (n_cols + block_dim_x - 1) / block_dim_x;
-  int grid_y = (n_rows + block_dim_x - 1) / block_dim_x;
+  int grid_y = ((uint64_t)n_rows + block_dim_x - 1) / block_dim_x;
 
   float ratio = static_cast<float>(grid_y) / static_cast<float>(grid_x);
   int adjusted_grid_y =
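
Note on the transpose change: once a half-precision matrix exceeds 2^31 elements, 32-bit index arithmetic such as blockIdx.x * TILE_DIM + threadIdx.x can wrap, which is what the uint64_t template parameters and loop counters above prevent. A minimal sketch of the failure mode under that assumption (the kernel below is illustrative, not code from this commit):

  #include <cuda_fp16.h>

  // Illustrative: flat elementwise copy over more than 2^31 half elements.
  // Promoting to uint64_t BEFORE the multiply is the essential part;
  // int(blockIdx.x) * blockDim.x wraps on grids covering huge matrices.
  __global__ void copy_flat(const half* in, half* out, uint64_t n_elems)
  {
    uint64_t i = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < n_elems) { out[i] = in[i]; }
  }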
4 changes: 2 additions & 2 deletions cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -35,8 +35,8 @@ void fit_embedding(raft::resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
-                   int nnz,
-                   int n,
+                   uint64_t nnz,
+                   uint64_t n,
                    int n_components,
                    T* out,
                    unsigned long long seed = 1234567)
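
For scale, the widening of nnz and n matters because fit_embedding is commonly handed a symmetrized k-NN graph: 5 million vertices with 500 neighbors each already yields 2.5 x 10^9 nonzeros, beyond INT_MAX (about 2.147 x 10^9), so a 32-bit nnz would silently wrap before the solver ever ran.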
2 changes: 1 addition & 1 deletion cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -104,7 +104,7 @@ RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
       // Note that if we did find a match, we don't need to
       // compute `res` on it here because it will be computed
       // in a different thread.
-      if (!found_match && vals[idx] != 0.0) {
+      if (!found_match && cur_val != 0.0) {
         orows[out_start_idx + row_nnz] = cur_col;
         ocols[out_start_idx + row_nnz] = cur_row;
         ovals[out_start_idx + row_nnz] = res;
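
This one-line change is a correctness fix rather than a type widening: as far as the surrounding kernel shows, cur_val caches the value of the entry this thread is emitting, while idx points at a different position in the scan, so testing vals[idx] could keep or drop the wrong nonzero.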
4 changes: 2 additions & 2 deletions cpp/include/raft/sparse/linalg/spectral.cuh
@@ -28,7 +28,7 @@ void fit_embedding(raft::resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
-                   int nnz,
+                   uint64_t nnz,
                    int n,
                    int n_components,
                    T* out,
@@ -40,4 +40,4 @@ void fit_embedding(raft::resources const& handle,
 };  // namespace sparse
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
101 changes: 54 additions & 47 deletions cpp/include/raft/sparse/op/detail/filter.cuh
@@ -43,34 +43,42 @@ namespace op {
 namespace detail {
 
 template <uint64_t TPB_X, typename T>
-RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
-                                     const int* cols,
-                                     const T* vals,
+RAFT_KERNEL coo_remove_scalar_kernel(const int* in_rows,
+                                     const int* in_cols,
+                                     const T* in_vals,
                                      uint64_t nnz,
-                                     int* crows,
-                                     int* ccols,
-                                     T* cvals,
-                                     uint64_t* ex_scan,
-                                     uint64_t* cur_ex_scan,
-                                     int m,
-                                     T scalar)
+                                     int* out_rows,
+                                     int* out_cols,
+                                     T* out_vals,
+                                     uint64_t* row_indices,
+                                     int* rows_length_acc,
+                                     T scalar,
+                                     int n_rows)
 {
-  int row = (blockIdx.x * TPB_X) + threadIdx.x;
-
-  if (row < m) {
-    uint64_t start       = cur_ex_scan[row];
-    uint64_t stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
-    uint64_t cur_out_idx = ex_scan[row];
-
-    for (uint64_t idx = start; idx < stop; idx++) {
-      if (vals[idx] != scalar) {
-        crows[cur_out_idx] = rows[idx];
-        ccols[cur_out_idx] = cols[idx];
-        cvals[cur_out_idx] = vals[idx];
-        ++cur_out_idx;
-      }
-    }
-  }
+  uint64_t in_idx = (blockIdx.x * TPB_X) + threadIdx.x;
+
+  if (in_idx >= nnz)
+    return;
+
+  T val = in_vals[in_idx];
+
+  if (val == scalar)
+    return;
+
+  int row = in_rows[in_idx];
+
+  if (row >= n_rows)
+    printf("row >= n_rows : %d\n", row);
+
+  uint64_t row_start_index = row_indices[row];
+  uint64_t out_idx = row_start_index + atomicAdd(rows_length_acc + row, 1);
+
+  if (out_idx >= nnz)
+    printf("out_idx >= nnz: %lu\n", out_idx);
+
+  out_rows[out_idx] = row;
+  out_cols[out_idx] = in_cols[in_idx];
+  out_vals[out_idx] = val;
 }
 
 /**
@@ -90,7 +98,7 @@ RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
  * @param d_alloc device allocator for temporary buffers
  * @param stream: cuda stream to use
  */
-template <int TPB_X, typename T>
+template <uint64_t TPB_X, typename T>
 void coo_remove_scalar(const int* rows,
                        const int* cols,
                        const T* vals,
@@ -99,28 +107,31 @@ void coo_remove_scalar(const int* rows,
                        int* ccols,
                        T* cvals,
                        int* cnnz,
-                       int* cur_cnnz,
                        T scalar,
                        int n,
                        cudaStream_t stream)
 {
   rmm::device_uvector<uint64_t> ex_scan(n, stream);
-  rmm::device_uvector<uint64_t> cur_ex_scan(n, stream);
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (uint64_t)n * sizeof(int), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(uint64_t), stream));
 
   thrust::device_ptr<int> dev_cnnz = thrust::device_pointer_cast(cnnz);
   thrust::device_ptr<uint64_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
   thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  thrust::device_ptr<int> dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz);
-  thrust::device_ptr<uint64_t> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
-  thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  raft::print_device_vector("ex_scan", ex_scan.data(), 20, std::cout);
+
+  auto maxiter_ex_scan = thrust::max_element(dev_ex_scan, dev_ex_scan + n);
+  uint64_t max_ex_scan = *maxiter_ex_scan;
+
+  std::cout << "max_ex_scan: " << max_ex_scan << std::endl;
+  std::cout << "nnz: " << nnz << std::endl;
+  std::cout << "n: " << n << std::endl;
+
+  rmm::device_uvector<int> rows_length_acc(n, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(rows_length_acc.data(), 0, (uint64_t)n * sizeof(int), stream));
 
-  dim3 grid(raft::ceildiv(n, TPB_X), 1, 1);
+  dim3 grid(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
   coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(rows,
@@ -131,9 +142,9 @@ void coo_remove_scalar(const int* rows,
                                                             ccols,
                                                             cvals,
                                                             dev_ex_scan.get(),
-                                                            dev_cur_ex_scan.get(),
-                                                            n,
-                                                            scalar);
+                                                            rows_length_acc.data(),
+                                                            scalar,
+                                                            n);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }

@@ -149,20 +160,17 @@ template <int TPB_X, typename T>
 void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
 {
   rmm::device_uvector<int> row_count_nz(in->n_rows, stream);
-  rmm::device_uvector<int> row_count(in->n_rows, stream);
 
   RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(int), stream));
-
-  linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   thrust::device_ptr<int> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
   uint64_t out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows, (uint64_t)0);
+  raft::print_device_vector("row_count_nz", row_count_nz.data(), 20, std::cout);
+  std::cout << "!!! out_nnz: " << out_nnz << std::endl;
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
   coo_remove_scalar<TPB_X, T>(in->rows(),
@@ -173,7 +181,6 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
                               out->cols(),
                               out->vals(),
                               row_count_nz.data(),
-                              row_count.data(),
                               scalar,
                               in->n_rows,
                               stream);
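
The filter rework changes the parallelization strategy: the old kernel gave each thread a row and walked that row's span between two exclusive scans, while the new one gives each thread a single input nonzero and claims an output slot via atomicAdd on a per-row cursor added to the row's scanned start offset (the printf bounds checks and std::cout dumps are debug scaffolding, consistent with the "progress so far" commit message). A condensed sketch of the scheme, with illustrative names:

  #include <cstdint>

  // Sketch: drop entries equal to `scalar` from a COO matrix.
  // row_start[r]  = exclusive scan of kept-entry counts per row
  // row_cursor[r] = number of slots already claimed in row r (zeroed first)
  template <typename T>
  __global__ void compact_coo(const int* rows, const int* cols, const T* vals,
                              uint64_t nnz, const uint64_t* row_start,
                              int* row_cursor, T scalar,
                              int* orows, int* ocols, T* ovals)
  {
    uint64_t i = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= nnz || vals[i] == scalar) return;
    int r = rows[i];
    // Claim the next free slot in this row's output segment.
    uint64_t o = row_start[r] + uint64_t(atomicAdd(row_cursor + r, 1));
    orows[o] = r;
    ocols[o] = cols[i];
    ovals[o] = vals[i];
  }

One thread per nonzero balances load when row lengths are skewed, at the cost of ordering: atomics decide slot order within a row, so column order in the output is no longer deterministic, whereas the old per-row loop preserved input order but serialized long rows.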
6 changes: 3 additions & 3 deletions cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -151,7 +151,7 @@ int performLanczosIteration(raft::resources const& handle,
 
   RAFT_EXPECTS(A != nullptr, "Null matrix pointer.");
 
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // -------------------------------------------------------
   // Compute second Lanczos vector
@@ -814,7 +814,7 @@ int computeSmallestEigenvectors(
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // Shift for implicit restart
   value_type_t shiftUpper;
@@ -1160,7 +1160,7 @@ int computeLargestEigenvectors(
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // Lanczos iteration counters
   index_type_t maxIter_curr = restartIter;  // Maximum size of Lanczos system
