Merge branch 'branch-25.04' into bm25-tfidf

rapidsai · Feb 19, 2025 · faef8ab · faef8ab
2 parents 4175a4d + 68d412a
commit faef8ab
Show file tree

Hide file tree

Showing 16 changed files with 424 additions and 46 deletions.
diff --git a/ci/build_docs.sh b/ci/build_docs.sh
@@ -3,35 +3,29 @@
 
 set -euo pipefail
 
+rapids-logger "Downloading artifacts from previous jobs"
+CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
+PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
+
 rapids-logger "Create test conda environment"
 . /opt/conda/etc/profile.d/conda.sh
 
-RAPIDS_VERSION="$(rapids-version)"
 RAPIDS_VERSION_MAJOR_MINOR="$(rapids-version-major-minor)"
 export RAPIDS_VERSION_MAJOR_MINOR
 
 rapids-dependency-file-generator \
   --output conda \
   --file-key docs \
-  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee env.yaml
+  --matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" \
+  --prepend-channel "${CPP_CHANNEL}" \
+  --prepend-channel "${PYTHON_CHANNEL}" \
+  | tee env.yaml
 
 rapids-mamba-retry env create --yes -f env.yaml -n docs
 conda activate docs
 
 rapids-print-env
 
-rapids-logger "Downloading artifacts from previous jobs"
-CPP_CHANNEL=$(rapids-download-conda-from-s3 cpp)
-PYTHON_CHANNEL=$(rapids-download-conda-from-s3 python)
-
-rapids-mamba-retry install \
-  --channel "${CPP_CHANNEL}" \
-  --channel "${PYTHON_CHANNEL}" \
-  "libraft=${RAPIDS_VERSION}" \
-  "libraft-headers=${RAPIDS_VERSION}" \
-  "pylibraft=${RAPIDS_VERSION}" \
-  "raft-dask=${RAPIDS_VERSION}"
-
 RAPIDS_DOCS_DIR="$(mktemp -d)"
 export RAPIDS_DOCS_DIR
 

diff --git a/cpp/include/raft/cluster/detail/kmeans.cuh b/cpp/include/raft/cluster/detail/kmeans.cuh
@@ -43,6 +43,7 @@
 
 #include <cuda.h>
 #include <thrust/fill.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/transform.h>
 
 #include <algorithm>
@@ -443,13 +444,12 @@ void kmeans_fit_main(raft::resources const& handle,
                                                         params.batch_centroids,
                                                         workspace);
 
-    // Using TransformInputIteratorT to dereference an array of
+    // Using thrust::transform_iterator to dereference an array of
     // raft::KeyValuePair and converting them to just return the Key to be used
     // in reduce_rows_by_key prims
     detail::KeyValueIndexOp<IndexT, DataT> conversion_op;
-    cub::TransformInputIterator<IndexT,
-                                detail::KeyValueIndexOp<IndexT, DataT>,
-                                raft::KeyValuePair<IndexT, DataT>*>
+    thrust::transform_iterator<detail::KeyValueIndexOp<IndexT, DataT>,
+                               raft::KeyValuePair<IndexT, DataT>*>
       itr(minClusterAndDistance.data_handle(), conversion_op);
 
     update_centroids(handle,

diff --git a/cpp/include/raft/cluster/detail/kmeans_balanced.cuh b/cpp/include/raft/cluster/detail/kmeans_balanced.cuh
@@ -47,6 +47,7 @@
 #include <rmm/resource_ref.hpp>
 
 #include <thrust/gather.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/transform.h>
 
 #include <limits>
@@ -288,7 +289,8 @@ void calc_centers_and_sizes(const raft::resources& handle,
       dataset, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters);
   } else {
     // todo(lsugy): use iterator from KV output of fusedL2NN
-    cub::TransformInputIterator<MathT, MappingOpT, const T*> mapping_itr(dataset, mapping_op);
+    thrust::transform_iterator<MappingOpT, const T*, thrust::use_default, MathT> mapping_itr(
+      dataset, mapping_op);
     raft::linalg::reduce_rows_by_key(
       mapping_itr, dim, labels, nullptr, n_rows, dim, n_clusters, centers, stream, reset_counters);
   }
@@ -894,7 +896,8 @@ auto build_fine_clusters(const raft::resources& handle,
                    "Number of fine clusters must be non-zero for a non-empty mesocluster");
     }
 
-    cub::TransformInputIterator<MathT, MappingOpT, const T*> mapping_itr(dataset_mptr, mapping_op);
+    thrust::transform_iterator<MappingOpT, const T*, thrust::use_default, MathT> mapping_itr(
+      dataset_mptr, mapping_op);
     raft::matrix::gather(mapping_itr, dim, n_rows, mc_trainset_ids, k, mc_trainset, stream);
     if (params.metric == raft::distance::DistanceType::L2Expanded ||
         params.metric == raft::distance::DistanceType::L2SqrtExpanded) {

diff --git a/cpp/include/raft/cluster/detail/kmeans_common.cuh b/cpp/include/raft/cluster/detail/kmeans_common.cuh
@@ -43,6 +43,7 @@
 #include <cuda.h>
 #include <thrust/fill.h>
 #include <thrust/for_each.h>
+#include <thrust/iterator/transform_iterator.h>
 
 #include <algorithm>
 #include <cmath>
@@ -199,8 +200,8 @@ void computeClusterCost(raft::resources const& handle,
 {
   cudaStream_t stream = resource::get_cuda_stream(handle);
 
-  cub::TransformInputIterator<OutputT, MainOpT, InputT*> itr(minClusterDistance.data_handle(),
-                                                             main_op);
+  thrust::transform_iterator<MainOpT, InputT*, thrust::use_default, OutputT> itr(
+    minClusterDistance.data_handle(), main_op);
 
   size_t temp_storage_bytes = 0;
   RAFT_CUDA_TRY(cub::DeviceReduce::Reduce(nullptr,
@@ -641,13 +642,12 @@ void countSamplesInCluster(raft::resources const& handle,
                                        params.batch_centroids,
                                        workspace);
 
-  // Using TransformInputIteratorT to dereference an array of raft::KeyValuePair
+  // Using thrust::transform_iterator to dereference an array of raft::KeyValuePair
   // and converting them to just return the Key to be used in reduce_rows_by_key
   // prims
   detail::KeyValueIndexOp<IndexT, DataT> conversion_op;
-  cub::TransformInputIterator<IndexT,
-                              detail::KeyValueIndexOp<IndexT, DataT>,
-                              raft::KeyValuePair<IndexT, DataT>*>
+  thrust::transform_iterator<detail::KeyValueIndexOp<IndexT, DataT>,
+                             raft::KeyValuePair<IndexT, DataT>*>
     itr(minClusterAndDistance.data_handle(), conversion_op);
 
   // count # of samples in each cluster

diff --git a/cpp/include/raft/core/sparse_types.hpp b/cpp/include/raft/core/sparse_types.hpp
@@ -168,7 +168,7 @@ class sparse_matrix {
                 row_type n_rows,
                 col_type n_cols,
                 nnz_type nnz = 0) noexcept(std::is_nothrow_default_constructible_v<container_type>)
-    : structure_{handle, n_rows, n_cols, nnz}, cp_{}, c_elements_{cp_.create(handle, 0)} {};
+    : structure_{handle, n_rows, n_cols, nnz}, cp_{}, c_elements_{cp_.create(handle, nnz)} {};
 
   // Constructor that owns the data but not the structure
   // This constructor is only callable with a `structure_type == *_structure_view`

diff --git a/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh b/cpp/include/raft/neighbors/detail/ivf_pq_build.cuh
@@ -53,6 +53,7 @@
 
 #include <cuda_fp16.h>
 #include <thrust/extrema.h>
+#include <thrust/iterator/transform_iterator.h>
 #include <thrust/scan.h>
 
 #include <memory>
@@ -180,8 +181,8 @@ void select_residuals(raft::resources const& handle,
   rmm::device_uvector<float> tmp(size_t(n_rows) * size_t(dim), stream, device_memory);
   // Note: the number of rows of the input dataset isn't actually n_rows, but matrix::gather doesn't
   // need to know it, any strictly positive number would work.
-  cub::TransformInputIterator<float, utils::mapping<float>, const T*> mapping_itr(
-    dataset, utils::mapping<float>{});
+  thrust::transform_iterator<utils::mapping<float>, const T*> mapping_itr(dataset,
+                                                                          utils::mapping<float>{});
   raft::matrix::gather(mapping_itr, (IdxT)dim, n_rows, row_ids, n_rows, tmp.data(), stream);
 
   raft::matrix::linewise_op(handle,

diff --git a/cpp/include/raft/sparse/linalg/detail/laplacian.cuh b/cpp/include/raft/sparse/linalg/detail/laplacian.cuh
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <raft/core/detail/macros.hpp>
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/util/cuda_rt_essentials.hpp>
+
+#include <algorithm>
+#include <type_traits>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+namespace detail {
+
+/* Compute the graph Laplacian of an adjacency matrix
+ *
+ * This kernel implements the necessary logic for computing a graph
+ * Laplacian for an adjacency matrix in CSR format. A custom kernel is
+ * required because cusparse does not conveniently implement matrix subtraction with 64-bit
+ * indices. The custom kernel also allows the computation to be completed
+ * with no extra allocations or compute.
+ *
+ * Each thread processes whole rows in a grid-stride loop over the x dimension,
+ * so any 1D launch configuration visits every row in [0, dim).
+ *
+ * output_values / output_indices: written with one entry per input non-zero
+ *   plus one diagonal (degree) entry per row. With nnz = adj_indptr[dim], the
+ *   largest index written is nnz + dim - 1.
+ * output_indptr: written at [row] and [row + 1] for every row, i.e. dim + 1
+ *   entries in total.
+ * dim: number of rows of the adjacency matrix (upper bound of the row loop).
+ * adj_values / adj_indices / adj_indptr: CSR arrays of the input; only read.
+ *
+ * NOTE(review): the index arithmetic appears to assume that column indices are
+ * sorted in ascending order within each row and that the input stores no
+ * explicit diagonal entries; otherwise the degree slot can collide with, or
+ * duplicate, another entry -- confirm against callers.
+ */
+template <typename ElementType, typename IndptrType, typename IndicesType>
+RAFT_KERNEL compute_graph_laplacian_kernel(ElementType* output_values,
+                                           IndicesType* output_indices,
+                                           IndptrType* output_indptr,
+                                           IndptrType dim,
+                                           ElementType const* adj_values,
+                                           IndicesType const* adj_indices,
+                                           IndptrType const* adj_indptr)
+{
+  /* The graph Laplacian L of an adjacency matrix A is given by:
+   * L = D - A
+   * where D is the degree matrix of A. The degree matrix is itself defined
+   * as the sum of each row of A and represents the degree of the node
+   * indicated by the index of the row.
+   *
+   * In the output, row `row` starts at adj_indptr[row] + row, because each of
+   * the `row` preceding rows gained exactly one diagonal entry. */
+
+  for (auto row = threadIdx.x + blockIdx.x * blockDim.x; row < dim; row += blockDim.x * gridDim.x) {
+    auto row_begin = adj_indptr[row];
+    auto row_end   = adj_indptr[row + 1];
+    // All output indexes will need to be offset by the row, since every row will
+    // gain exactly one new non-zero element. degree_output_index is the index
+    // where we will store the degree of each row
+    auto degree_output_index = row_begin + row;
+    auto degree_value        = ElementType{};
+    // value_index indicates the index of the current value in the original
+    // adjacency matrix
+    for (auto value_index = row_begin; value_index < row_end; ++value_index) {
+      auto col_index         = adj_indices[value_index];
+      auto is_lower_diagonal = col_index < row;
+      auto output_index      = value_index + row + !is_lower_diagonal;  // skip one slot once past the diagonal
+      auto input_value       = adj_values[value_index];
+      degree_value += input_value;
+      output_values[output_index]  = ElementType{-1} * input_value;  // off-diagonal entries of L are -A
+      output_indices[output_index] = col_index;
+      // Increment the index where we will store the degree for every non-zero
+      // element before we reach the diagonal
+      degree_output_index += is_lower_diagonal;
+    }
+    output_values[degree_output_index]  = degree_value;
+    output_indices[degree_output_index] = row;
+    output_indptr[row]                  = row_begin + row;
+    output_indptr[row + 1]              = row_end + row + 1;  // same value the thread owning row + 1 stores
+  }
+}
+
+/**
+ * Compute the graph Laplacian (L = D - A) of a square CSR adjacency matrix.
+ *
+ * Allocates a new CSR matrix with `nnz + dim` stored entries (the input
+ * non-zeros plus one diagonal degree entry per row) and fills it by launching
+ * compute_graph_laplacian_kernel on the CUDA stream obtained from `res`.
+ * The stream is not synchronized here; only the launch itself is checked.
+ *
+ * @param res   RAFT resources; provides the CUDA stream and is passed to
+ *              make_device_csr_matrix for the result allocation
+ * @param input view of the adjacency matrix in CSR format; the RAFT_EXPECTS
+ *              check below fails if it is not square
+ * @return the newly created device CSR matrix holding the Laplacian
+ */
+template <typename ElementType, typename IndptrType, typename IndicesType, typename NZType>
+auto compute_graph_laplacian(
+  raft::resources const& res,
+  device_csr_matrix_view<ElementType, IndptrType, IndicesType, NZType> input)
+{
+  auto input_structure = input.structure_view();
+  auto dim             = input_structure.get_n_rows();
+  RAFT_EXPECTS(dim == input_structure.get_n_cols(),
+               "The graph Laplacian can only be computed on a square adjacency matrix");
+  auto result = make_device_csr_matrix<std::remove_const_t<ElementType>,
+                                       std::remove_const_t<IndptrType>,
+                                       std::remove_const_t<IndicesType>,
+                                       std::remove_const_t<NZType>>(
+    res,
+    dim,
+    dim,
+    /* The nnz for the result will be the dimension of the (square) input matrix plus the number of
+     * non-zero elements in the original matrix, since we introduce non-zero elements along the
+     * diagonal to represent the degree of each node. */
+    input_structure.get_nnz() + dim);
+  auto result_structure                         = result.structure_view();
+  auto static constexpr const threads_per_block = 256;
+  // One thread per row, capped at 65535 blocks; the kernel's grid-stride loop
+  // covers any remaining rows. Clamp to at least one block so that an empty
+  // matrix (dim == 0) does not produce an invalid zero-sized launch.
+  auto blocks =
+    std::max(std::min(int((dim + threads_per_block - 1) / threads_per_block), 65535), 1);
+  auto stream = resource::get_cuda_stream(res);
+  // Execution configuration order is <<<grid, block, shared_mem, stream>>>.
+  // Passing the block count as the block size would exceed the per-block
+  // thread limit (and fail to launch) for large matrices.
+  detail::compute_graph_laplacian_kernel<<<blocks, threads_per_block, 0, stream>>>(
+    result.get_elements().data(),
+    result_structure.get_indices().data(),
+    result_structure.get_indptr().data(),
+    dim,
+    input.get_elements().data(),
+    input_structure.get_indices().data(),
+    input_structure.get_indptr().data());
+  // Kernel launches do not report errors directly; surface bad launch
+  // configurations here instead of returning an uninitialized result.
+  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  return result;
+}
+
+}  // namespace detail
+}  // namespace linalg
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/include/raft/sparse/linalg/laplacian.cuh b/cpp/include/raft/sparse/linalg/laplacian.cuh
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2025, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/resources.hpp>
+#include <raft/sparse/linalg/detail/laplacian.cuh>
+
+namespace raft {
+namespace sparse {
+namespace linalg {
+
+/** Given a CSR adjacency matrix, return the graph Laplacian
+ *
+ * Note that for non-symmetric matrices, the out-degree Laplacian is returned.
+ *
+ * This is a thin public entry point: both arguments are forwarded unchanged to
+ * detail::compute_graph_laplacian and its result is returned as-is.
+ *
+ * @param res   RAFT resources object, passed through to the implementation
+ * @param input view of the adjacency matrix in CSR format
+ * @return the value produced by detail::compute_graph_laplacian for these arguments
+ */
+template <typename ElementType, typename IndptrType, typename IndicesType, typename NZType>
+auto compute_graph_laplacian(
+  raft::resources const& res,
+  device_csr_matrix_view<ElementType, IndptrType, IndicesType, NZType> input)
+{
+  return detail::compute_graph_laplacian(res, input);
+}
+
+}  // namespace linalg
+}  // namespace sparse
+}  // namespace raft
diff --git a/cpp/include/raft/spectral/detail/matrix_wrappers.hpp b/cpp/include/raft/spectral/detail/matrix_wrappers.hpp
@@ -15,6 +15,8 @@
  */
 #pragma once
 
+#include <raft/core/device_csr_matrix.hpp>
+#include <raft/core/device_span.hpp>
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/core/resource/cusparse_handle.hpp>
@@ -33,6 +35,7 @@
 #include <thrust/system/cuda/execution_policy.h>
 
 #include <algorithm>
+#include <cstddef>
 
 // =========================================================
 // Useful macros
@@ -181,6 +184,19 @@ struct sparse_matrix_t {
   {
   }
 
+  auto to_csr_matrix_view() const  // returns a read-only CSR view built from this object's members
+  {
+    // The usage of sparse_matrix_t prior to introduction of this method
+    // assumed that all data was strictly on device. We will make the same
+    // assumption for construction of the csr_matrix_view
+    return device_csr_matrix_view<value_type const, index_type const, index_type const, index_type>{
+      device_span<value_type const>{values_, std::uint64_t(nnz_)},  // nnz_ element values
+      device_compressed_structure_view<index_type const, index_type const, index_type>{
+        device_span<index_type const>{row_offsets_, std::uint64_t(nrows_ + 1)},  // nrows_ + 1 offsets
+        device_span<index_type const>{col_indices_, std::uint64_t(nnz_)},  // nnz_ column indices
+        ncols_}};  // spans wrap the existing pointers; nothing is copied here
+  }
+
   virtual ~sparse_matrix_t(void) =
     default;  // virtual because used as base for following matrix types
 

diff --git a/cpp/include/raft/spectral/detail/partition.hpp b/cpp/include/raft/spectral/detail/partition.hpp
@@ -18,6 +18,7 @@
 #include <raft/core/resource/cublas_handle.hpp>
 #include <raft/core/resource/cuda_stream.hpp>
 #include <raft/linalg/detail/cublas_wrappers.hpp>
+#include <raft/sparse/linalg/laplacian.cuh>
 #include <raft/spectral/cluster_solvers.cuh>
 #include <raft/spectral/detail/spectral_util.cuh>
 #include <raft/spectral/eigen_solvers.cuh>
@@ -97,14 +98,15 @@ std::tuple<vertex_t, weight_t, vertex_t> partition(
   // Compute eigenvectors of Laplacian
 
   // Initialize Laplacian
-  /// sparse_matrix_t<vertex_t, weight_t> A{handle, graph};
-  spectral::matrix::laplacian_matrix_t<vertex_t, weight_t, nnz_t> L{handle, csr_m};
+  auto laplacian =
+    raft::sparse::linalg::compute_graph_laplacian(handle, csr_m.to_csr_matrix_view());
 
   auto eigen_config = eigen_solver.get_config();
   auto nEigVecs     = eigen_config.n_eigVecs;
 
   // Compute smallest eigenvalues and eigenvectors
-  std::get<0>(stats) = eigen_solver.solve_smallest_eigenvectors(handle, L, eigVals, eigVecs);
+  std::get<0>(stats) =
+    eigen_solver.solve_smallest_eigenvectors(handle, laplacian.view(), eigVals, eigVecs);
 
   // Whiten eigenvector matrix
   transform_eigen_matrix(handle, n, nEigVecs, eigVecs);