
Commit b9b9dd6

Merge branch 'rapidsai:branch-24.10' into umap-batch-nnd
jinsolp committed Aug 21, 2024
2 parents 2cb4623 + d09e713 commit b9b9dd6
Showing 15 changed files with 218 additions and 82 deletions.
23 changes: 21 additions & 2 deletions ci/build_wheel.sh
@@ -18,14 +18,33 @@ rapids-generate-version > ./VERSION

cd ${package_dir}

SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DDISABLE_DEPRECATION_WARNINGS=ON;-DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/" \
case "${RAPIDS_CUDA_VERSION}" in
12.*)
EXCLUDE_ARGS=(
--exclude "libcublas.so.12"
--exclude "libcublasLt.so.12"
--exclude "libcufft.so.11"
--exclude "libcurand.so.10"
--exclude "libcusolver.so.11"
--exclude "libcusparse.so.12"
--exclude "libnvJitLink.so.12"
)
EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=ON"
;;
11.*)
EXCLUDE_ARGS=()
EXTRA_CMAKE_ARGS=";-DUSE_CUDA_MATH_WHEELS=OFF"
;;
esac

SKBUILD_CMAKE_ARGS="-DDETECT_CONDA_ENV=OFF;-DDISABLE_DEPRECATION_WARNINGS=ON;-DCPM_cumlprims_mg_SOURCE=${GITHUB_WORKSPACE}/cumlprims_mg/${EXTRA_CMAKE_ARGS}" \
python -m pip wheel . \
-w dist \
-vvv \
--no-deps \
--disable-pip-version-check

mkdir -p final_dist
python -m auditwheel repair -w final_dist dist/*
python -m auditwheel repair -w final_dist "${EXCLUDE_ARGS[@]}" dist/*

RAPIDS_PY_WHEEL_NAME="cuml_${RAPIDS_PY_CUDA_SUFFIX}" rapids-upload-wheels-to-s3 final_dist
4 changes: 2 additions & 2 deletions ci/release/update-version.sh
@@ -18,7 +18,7 @@ CURRENT_MINOR=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[2]}')
CURRENT_PATCH=$(echo $CURRENT_TAG | awk '{split($0, a, "."); print a[3]}')
CURRENT_SHORT_TAG=${CURRENT_MAJOR}.${CURRENT_MINOR}

#Get <major>.<minor> for next version
# Get <major>.<minor> for next version
NEXT_MAJOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[1]}')
NEXT_MINOR=$(echo $NEXT_FULL_TAG | awk '{split($0, a, "."); print a[2]}')
NEXT_SHORT_TAG=${NEXT_MAJOR}.${NEXT_MINOR}
@@ -58,7 +58,7 @@ for DEP in "${DEPENDENCIES[@]}"; do
sed_runner "/-.* ${DEP}\(-cu[[:digit:]]\{2\}\)\{0,1\}==/ s/==.*/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0/g" "${FILE}"
done
for FILE in python/*/pyproject.toml; do
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" ${FILE}
sed_runner "/\"${DEP}==/ s/==.*\"/==${NEXT_SHORT_TAG_PEP440}.*,>=0.0.0a0\"/g" "${FILE}"
done
done

2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -22,7 +22,7 @@ dependencies:
- doxygen=1.9.1
- gcc_linux-64=11.*
- graphviz
- hdbscan<=0.8.30
- hdbscan>=0.8.38,<0.8.39
- hypothesis>=6.0,<7
- ipykernel
- ipython
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-125_arch-x86_64.yaml
@@ -24,7 +24,7 @@ dependencies:
- doxygen=1.9.1
- gcc_linux-64=11.*
- graphviz
- hdbscan<=0.8.30
- hdbscan>=0.8.38,<0.8.39
- hypothesis>=6.0,<7
- ipykernel
- ipython
2 changes: 1 addition & 1 deletion conda/recipes/cuml-cpu/meta.yaml
@@ -34,7 +34,7 @@ requirements:
- numpy>=1.23,<2.0a0
- pandas
- scikit-learn=1.2
- hdbscan<=0.8.30
- hdbscan>=0.8.38,<0.8.39
- umap-learn=0.5.3
- nvtx

11 changes: 3 additions & 8 deletions cpp/src/fil/common.cuh
@@ -309,17 +309,12 @@ struct compute_smem_footprint : dispatch_functor<int> {
int run(predict_params);
};

template <int NITEMS,
leaf_algo_t leaf_algo,
bool cols_in_shmem,
bool CATS_SUPPORTED,
class storage_type>
__attribute__((visibility("hidden"))) __global__ void infer_k(storage_type forest,
predict_params params);

// infer() calls the inference kernel with the parameters on the stream
template <typename storage_type>
void infer(storage_type forest, predict_params params, cudaStream_t stream);

template <typename storage_type>
void infer_shared_mem_size(predict_params params, int max_shm);

} // namespace fil
} // namespace ML
29 changes: 5 additions & 24 deletions cpp/src/fil/fil.cu
@@ -349,26 +349,6 @@ struct forest {
cat_sets_device_owner cat_sets_;
};

template <typename storage_type>
struct opt_into_arch_dependent_shmem : dispatch_functor<void> {
const int max_shm;
opt_into_arch_dependent_shmem(int max_shm_) : max_shm(max_shm_) {}

template <typename KernelParams = KernelTemplateParams<>>
void run(predict_params p)
{
auto kernel = infer_k<KernelParams::N_ITEMS,
KernelParams::LEAF_ALGO,
KernelParams::COLS_IN_SHMEM,
KernelParams::CATS_SUPPORTED,
storage_type>;
// p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because
// we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick.
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm));
}
};

template <typename real_t>
struct dense_forest<dense_node<real_t>> : forest<real_t> {
using node_t = dense_node<real_t>;
@@ -427,8 +407,9 @@ struct dense_forest<dense_node<real_t>> : forest<real_t> {
h.get_stream()));

// predict_proba is a runtime parameter, and opt-in is unconditional
dispatch_on_fil_template_params(opt_into_arch_dependent_shmem<storage<node_t>>(this->max_shm_),
static_cast<predict_params>(this->class_ssp_));
fil::infer_shared_mem_size<storage<node_t>>(static_cast<predict_params>(this->class_ssp_),
this->max_shm_);

// copy must be finished before freeing the host data
h.sync_stream();
h_nodes_.clear();
@@ -491,8 +472,8 @@ struct sparse_forest : forest<typename node_t::real_type> {
nodes_.data(), nodes, sizeof(node_t) * num_nodes_, cudaMemcpyHostToDevice, h.get_stream()));

// predict_proba is a runtime parameter, and opt-in is unconditional
dispatch_on_fil_template_params(opt_into_arch_dependent_shmem<storage<node_t>>(this->max_shm_),
static_cast<predict_params>(this->class_ssp_));
fil::infer_shared_mem_size<storage<node_t>>(static_cast<predict_params>(this->class_ssp_),
this->max_shm_);
}

virtual void infer(predict_params params, cudaStream_t stream) override
32 changes: 32 additions & 0 deletions cpp/src/fil/infer.cu
@@ -908,12 +908,38 @@ struct infer_k_storage_template : dispatch_functor<void> {
}
};

template <typename storage_type>
struct opt_into_arch_dependent_shmem : dispatch_functor<void> {
const int max_shm;
opt_into_arch_dependent_shmem(int max_shm_) : max_shm(max_shm_) {}

template <typename KernelParams = KernelTemplateParams<>>
void run(predict_params p)
{
auto kernel = infer_k<KernelParams::N_ITEMS,
KernelParams::LEAF_ALGO,
KernelParams::COLS_IN_SHMEM,
KernelParams::CATS_SUPPORTED,
storage_type>;
// p.shm_sz might be > max_shm or < MAX_SHM_STD, but we should not check for either, because
// we don't run on both proba_ssp_ and class_ssp_ (only class_ssp_). This should be quick.
RAFT_CUDA_TRY(
cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm));
}
};

template <typename storage_type>
void infer(storage_type forest, predict_params params, cudaStream_t stream)
{
dispatch_on_fil_template_params(infer_k_storage_template<storage_type>(forest, stream), params);
}

template <typename storage_type>
void infer_shared_mem_size(predict_params params, int max_shm)
{
dispatch_on_fil_template_params(opt_into_arch_dependent_shmem<storage_type>(max_shm), params);
}

template void infer<dense_storage_f32>(dense_storage_f32 forest,
predict_params params,
cudaStream_t stream);
@@ -930,5 +956,11 @@ template void infer<sparse_storage8>(sparse_storage8 forest,
predict_params params,
cudaStream_t stream);

template void infer_shared_mem_size<dense_storage_f32>(predict_params params, int max_shm);
template void infer_shared_mem_size<dense_storage_f64>(predict_params params, int max_shm);
template void infer_shared_mem_size<sparse_storage16_f32>(predict_params params, int max_shm);
template void infer_shared_mem_size<sparse_storage16_f64>(predict_params params, int max_shm);
template void infer_shared_mem_size<sparse_storage8>(predict_params params, int max_shm);

} // namespace fil
} // namespace ML
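
For context on the new infer_shared_mem_size entry point: CUDA limits dynamic shared memory to 48 KiB per block by default, and a kernel must be opted in with cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, bytes) before a launch may request more — the call made inside opt_into_arch_dependent_shmem above. A minimal standalone sketch of that opt-in pattern, using illustrative names that are not cuML symbols:

#include <cuda_runtime.h>
#include <cstdio>

// Kernel that uses dynamically sized shared memory as scratch space.
__global__ void scratch_kernel(float* out)
{
  extern __shared__ float scratch[];
  scratch[threadIdx.x] = static_cast<float>(threadIdx.x);
  __syncthreads();
  out[threadIdx.x] = scratch[threadIdx.x];
}

int main()
{
  int max_shm = 0;
  // Per-block opt-in limit on device 0; larger than the 48 KiB default on Volta and newer.
  cudaDeviceGetAttribute(&max_shm, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0);

  // Without this attribute, a launch requesting more than 48 KiB of dynamic
  // shared memory fails; with it, the kernel may use up to max_shm bytes.
  cudaFuncSetAttribute(scratch_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, max_shm);

  float* out = nullptr;
  cudaMalloc(&out, 256 * sizeof(float));
  scratch_kernel<<<1, 256, max_shm>>>(out);
  cudaDeviceSynchronize();

  std::printf("opted in to %d bytes of dynamic shared memory\n", max_shm);
  cudaFree(out);
  return 0;
}
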
30 changes: 24 additions & 6 deletions dependencies.yaml
@@ -88,6 +88,7 @@ files:
extras:
table: project
includes:
- cuda
- py_run
py_test:
output: pyproject
@@ -176,7 +177,7 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- cython>=3.0.0
- &cython cython>=3.0.0
- &treelite treelite==4.3.0
- output_types: conda
packages:
@@ -406,6 +407,26 @@ dependencies:
- *libcusolver114
- *libcusparse_dev114
- *libcusparse114
- output_types: pyproject
matrices:
- matrix:
cuda: "12.*"
packages:
- nvidia-cublas-cu12
- nvidia-cufft-cu12
- nvidia-curand-cu12
- nvidia-cusparse-cu12
- nvidia-cusolver-cu12
- matrix:
cuda: "11.*"
packages:
- matrix:
packages:
- nvidia-cublas
- nvidia-cufft
- nvidia-curand
- nvidia-cusparse
- nvidia-cusolver
docs:
common:
- output_types: [conda, requirements]
@@ -465,7 +486,9 @@ dependencies:
common:
- output_types: [conda, requirements, pyproject]
packages:
- *cython
- dask-ml
- hdbscan>=0.8.38,<0.8.39
- hypothesis>=6.0,<7
- nltk
- numpydoc
@@ -484,14 +507,9 @@
- pip
- pip:
- dask-glm==0.3.0
# TODO: remove pin once a release that includes fixes for the error
# is released: https://github.com/rapidsai/cuml/issues/5514
- hdbscan<=0.8.30
- output_types: pyproject
packages:
- dask-glm==0.3.0
# TODO: Can we stop pulling from the master branch now that there was a release in October?
- hdbscan @ git+https://github.com/scikit-learn-contrib/hdbscan.git@master
test_notebooks:
common:
- output_types: [conda, requirements]
26 changes: 24 additions & 2 deletions python/cuml/CMakeLists.txt
@@ -38,6 +38,7 @@ project(
option(CUML_UNIVERSAL "Build all cuML Python components." ON)
option(FIND_CUML_CPP "Search for existing CUML C++ installations before defaulting to local files" OFF)
option(SINGLEGPU "Disable all mnmg components and comms libraries" OFF)
option(USE_CUDA_MATH_WHEELS "Use the CUDA math wheels instead of the system libraries" OFF)
set(CUML_RAFT_CLONE_ON_PIN OFF)


@@ -72,8 +73,10 @@ include(rapids-cython-core)

set(CUML_PYTHON_TREELITE_TARGET treelite::treelite)

if(NOT ${CUML_CPU})
if(NOT CUML_CPU)
if(NOT cuml_FOUND)
find_package(CUDAToolkit REQUIRED)

set(BUILD_CUML_TESTS OFF)
set(BUILD_PRIMS_TESTS OFF)
set(BUILD_CUML_C_LIBRARY OFF)
@@ -85,11 +88,19 @@ if(NOT ${CUML_CPU})

# Statically link dependencies if building wheels
set(CUDA_STATIC_RUNTIME ON)
set(CUDA_STATIC_MATH_LIBRARIES ON)
set(CUML_USE_RAFT_STATIC ON)
set(CUML_USE_FAISS_STATIC ON)
set(CUML_USE_TREELITE_STATIC ON)
set(CUML_USE_CUMLPRIMS_MG_STATIC ON)
# Link to the CUDA wheels with shared libraries for CUDA 12+
if(CUDAToolkit_VERSION VERSION_GREATER_EQUAL 12.0)
set(CUDA_STATIC_MATH_LIBRARIES OFF)
else()
if(USE_CUDA_MATH_WHEELS)
message(FATAL_ERROR "Cannot use CUDA math wheels with CUDA < 12.0")
endif()
set(CUDA_STATIC_MATH_LIBRARIES ON)
endif()
# Don't install the static libs into wheels
set(CUML_EXCLUDE_RAFT_FROM_ALL ON)
set(RAFT_EXCLUDE_FAISS_FROM_ALL ON)
@@ -98,6 +109,17 @@ if(NOT ${CUML_CPU})

add_subdirectory(${CUML_CPP_SRC} cuml-cpp EXCLUDE_FROM_ALL)

if(NOT CUDA_STATIC_MATH_LIBRARIES AND USE_CUDA_MATH_WHEELS)
set_property(TARGET ${CUML_CPP_TARGET} PROPERTY INSTALL_RPATH
"$ORIGIN/../nvidia/cublas/lib"
"$ORIGIN/../nvidia/cufft/lib"
"$ORIGIN/../nvidia/curand/lib"
"$ORIGIN/../nvidia/cusolver/lib"
"$ORIGIN/../nvidia/cusparse/lib"
"$ORIGIN/../nvidia/nvjitlink/lib"
)
endif()

set(cython_lib_dir cuml)
install(TARGETS ${CUML_CPP_TARGET} DESTINATION ${cython_lib_dir})
endif()