Fix UMAP issues with large inputs #6245

Open · wants to merge 24 commits into base: branch-25.04
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -76,7 +76,7 @@ dependencies:
   - sphinx-markdown-tables
   - statsmodels
   - sysroot_linux-64==2.28
-  - treelite==4.4.1
+  - treelite==4.3.0
   - umap-learn==0.5.6
   - xgboost>=2.1.0
 name: all_cuda-118_arch-x86_64
2 changes: 1 addition & 1 deletion conda/environments/all_cuda-128_arch-x86_64.yaml
@@ -72,7 +72,7 @@ dependencies:
   - sphinx-markdown-tables
   - statsmodels
   - sysroot_linux-64==2.28
-  - treelite==4.4.1
+  - treelite==4.3.0
   - umap-learn==0.5.6
   - xgboost>=2.1.0
 name: all_cuda-128_arch-x86_64
2 changes: 1 addition & 1 deletion conda/recipes/cuml/conda_build_config.yaml
@@ -20,4 +20,4 @@ c_stdlib_version:
   - "=2.28"

 treelite_version:
-  - "=4.4.1"
+  - "=4.3.0"
2 changes: 1 addition & 1 deletion conda/recipes/libcuml/conda_build_config.yaml
@@ -26,7 +26,7 @@ spdlog_version:
   - ">=1.14.1,<1.15"

 treelite_version:
-  - "=4.4.1"
+  - "=4.3.0"

 # The CTK libraries below are missing from the conda-forge::cudatoolkit package
 # for CUDA 11. The "*_host_*" version specifiers correspond to `11.8` packages
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_cuvs.cmake
@@ -73,8 +73,8 @@ endfunction()
 # To use a different CUVS locally, set the CMake variable
 # CPM_cuvs_SOURCE=/path/to/local/cuvs
 find_and_configure_cuvs(VERSION ${CUML_MIN_VERSION_cuvs}
-                        FORK rapidsai
-                        PINNED_TAG branch-${CUML_BRANCH_VERSION_cuvs}
+                        FORK divyegala
+                        PINNED_TAG raft-sparse-updates
                         EXCLUDE_FROM_ALL ${CUML_EXCLUDE_CUVS_FROM_ALL}
                         # When PINNED_TAG above doesn't match cuml,
                         # force local cuvs clone in build directory

Contributor, on FORK divyegala: Just noting that we'll need to make sure to switch these back once the associated PRs have been merged so we don't forget.

Contributor, on lines +76 to +77: Reminder to revert this before merge.
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_raft.cmake
@@ -72,8 +72,8 @@ endfunction()
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION ${CUML_MIN_VERSION_raft}
-                        FORK rapidsai
-                        PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
+                        FORK viclafargue
+                        PINNED_TAG fix-sparse-utilities
                         EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL}
                         # When PINNED_TAG above doesn't match cuml,
                         # force local raft clone in build directory

Contributor, on lines +75 to +76: Reminder to revert this before merge.
4 changes: 2 additions & 2 deletions cpp/cmake/thirdparty/get_treelite.cmake
@@ -79,7 +79,7 @@ function(find_and_configure_treelite)
   rapids_export_find_package_root(BUILD Treelite [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cuml-exports)
 endfunction()

-find_and_configure_treelite(VERSION 4.4.1
-                            PINNED_TAG 386bd0de99f5a66584c7e58221ee38ce606ad1ae
+find_and_configure_treelite(VERSION 4.3.0
+                            PINNED_TAG 575e4208f2b18e40d818c338ecb95d7a26e69aab
                             EXCLUDE_FROM_ALL ${CUML_EXCLUDE_TREELITE_FROM_ALL}
                             BUILD_STATIC_LIBS ${CUML_USE_TREELITE_STATIC})

Contributor, on lines +82 to +83: What is the motivation for the downgrade?
8 changes: 5 additions & 3 deletions cpp/include/cuml/manifold/common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2022, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -16,6 +16,8 @@

 #pragma once

+#include <stdint.h>
+
 namespace ML {

 // Dense input uses int64_t until FAISS is updated
@@ -55,8 +57,8 @@ struct knn_graph {
 template <typename T>
 struct manifold_inputs_t {
   T* y;
-  int n;
-  int d;
+  uint64_t n;
+  uint64_t d;

   manifold_inputs_t(T* y_, int n_, int d_) : y(y_), n(n_), d(d_) {}
3 changes: 2 additions & 1 deletion cpp/src/tsne/tsne_runner.cuh
@@ -37,6 +37,7 @@

 #include <cuvs/distance/distance.hpp>
 #include <pca/pca.cuh>
+#include <stdint.h>

 namespace ML {

@@ -167,7 +168,7 @@ class TSNE_runner {
   {
     distance_and_perplexity();

-    const auto NNZ  = COO_Matrix.nnz;
+    const auto NNZ  = (value_idx)COO_Matrix.nnz;
     auto* VAL       = COO_Matrix.vals();
     const auto* COL = COO_Matrix.cols();
     const auto* ROW = COO_Matrix.rows();

Contributor, suggesting value_idx(COO_Matrix.nnz) in place of the C-style (value_idx)COO_Matrix.nnz: Let's not let it prevent merge, but if we need to make further changes, it would be good to avoid raw casts.
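For readers outside the review thread, the distinction is easy to demo in isolation. A standalone sketch (value_idx is narrowed to int32_t here purely for illustration): all three spellings below perform the same conversion, but static_cast is the easiest to grep for and the most constrained, which is what makes raw casts worth avoiding in shared code.

#include <cstdint>

using value_idx = int32_t;  // stand-in for the template parameter in tsne_runner.cuh

int main()
{
  uint64_t nnz = 1000;  // plays the role of COO_Matrix.nnz

  value_idx a = (value_idx)nnz;               // C-style cast, as in the diff
  value_idx b = value_idx(nnz);               // functional cast, as suggested
  value_idx c = static_cast<value_idx>(nnz);  // explicit cast: searchable, refuses pointer puns
  return (a == b && b == c) ? 0 : 1;
}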
32 changes: 18 additions & 14 deletions cpp/src/umap/fuzzy_simpl_set/naive.cuh
@@ -30,6 +30,7 @@

 #include <cuda_runtime.h>

+#include <stdint.h>
 #include <stdio.h>

 #include <string>
@@ -92,7 +93,8 @@ CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists,
 {
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  int i   = row * n_neighbors;  // each thread processes one row of the dist matrix
+  uint64_t i =
+    static_cast<uint64_t>(row) * n_neighbors;  // each thread processes one row of the dist matrix

   if (row < n) {
     float target = __log2f(n_neighbors) * bandwidth;
@@ -190,7 +192,7 @@ CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists,
  *
  * Descriptions adapted from: https://github.com/lmcinnes/umap/blob/master/umap/umap_.py
  */
-template <int TPB_X, typename value_idx, typename value_t>
+template <uint64_t TPB_X, typename value_idx, typename value_t>
 CUML_KERNEL void compute_membership_strength_kernel(
   const value_idx* knn_indices,
   const float* knn_dists,  // nn outputs
@@ -199,14 +201,14 @@ CUML_KERNEL void compute_membership_strength_kernel(
   value_t* vals,
   int* rows,
   int* cols,  // result coo
-  int n,
-  int n_neighbors)
+  int n_neighbors,
+  uint64_t to_process)
 {  // model params

   // row-based matrix is best
-  int idx = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t idx = (blockIdx.x * TPB_X) + threadIdx.x;

-  if (idx < n * n_neighbors) {
+  if (idx < to_process) {
     int row = idx / n_neighbors;  // one neighbor per thread

     double cur_rho = rhos[row];
@@ -237,8 +239,8 @@
 /*
  * Sets up and runs the knn dist smoothing
  */
-template <int TPB_X, typename value_idx, typename value_t>
-void smooth_knn_dist(int n,
+template <uint64_t TPB_X, typename value_idx, typename value_t>
+void smooth_knn_dist(uint64_t n,
                      const value_idx* knn_indices,
                      const float* knn_dists,
                      value_t* rhos,
@@ -253,7 +255,8 @@

   rmm::device_uvector<value_t> dist_means_dev(n_neighbors, stream);

-  raft::stats::mean(dist_means_dev.data(), knn_dists, 1, n_neighbors * n, false, false, stream);
+  raft::stats::mean(
+    dist_means_dev.data(), knn_dists, uint64_t{1}, n * n_neighbors, false, false, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());

   value_t mean_dist = 0.0;
@@ -284,8 +287,8 @@
  * @param params UMAPParams config object
  * @param stream cuda stream to use for device operations
  */
-template <int TPB_X, typename value_idx, typename value_t>
-void launcher(int n,
+template <uint64_t TPB_X, typename value_idx, typename value_t>
+void launcher(uint64_t n,
               const value_idx* knn_indices,
               const value_t* knn_dists,
               int n_neighbors,
@@ -328,7 +331,8 @@
   /*
    * Compute graph of membership strengths
    */

-  dim3 grid_elm(raft::ceildiv(n * n_neighbors, TPB_X), 1, 1);
+  uint64_t to_process = static_cast<uint64_t>(in.n_rows) * n_neighbors;
+  dim3 grid_elm(raft::ceildiv(to_process, TPB_X), 1, 1);
   dim3 blk_elm(TPB_X, 1, 1);

   compute_membership_strength_kernel<TPB_X><<<grid_elm, blk_elm, 0, stream>>>(knn_indices,
@@ -338,8 +342,8 @@
                                                                               in.vals(),
                                                                               in.rows(),
                                                                               in.cols(),
-                                                                              in.n_rows,
-                                                                              n_neighbors);
+                                                                              n_neighbors,
+                                                                              to_process);
   RAFT_CUDA_TRY(cudaPeekAtLastError());

   if (ML::default_logger().should_log(ML::level_enum::debug)) {

Contributor, on the grid_elm computation: I believe there's a chance for an overflow here. If to_process is large enough, we'd get something that would overflow a 32-bit integer, right? We also probably want to limit this to the maximum number of blocks (65535) anyway.
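One conventional way to address both concerns in the comment above is a grid-stride loop; the sketch below uses a hypothetical stub kernel and launcher (strength_stub, launch_capped, and the 256-thread block size are illustrative, not from the PR). The grid size is capped, and each thread strides across the full 64-bit index space, so neither a huge to_process nor the block limit is a problem. Note that 65535 is the limit for the grid's y/z dimensions; x allows up to 2^31 - 1 on current GPUs, but a conservative cap costs nothing once the kernel strides.

#include <algorithm>
#include <cstdint>

#include <cuda_runtime.h>

// Stand-in for compute_membership_strength_kernel: each thread strides over
// the 64-bit index space, so the launch needs only a bounded number of blocks
// no matter how large to_process gets.
__global__ void strength_stub(const float* knn_dists, float* vals, uint64_t to_process)
{
  const uint64_t stride = static_cast<uint64_t>(gridDim.x) * blockDim.x;
  for (uint64_t idx = static_cast<uint64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
       idx < to_process;
       idx += stride) {
    vals[idx] = knn_dists[idx];  // placeholder for the real membership computation
  }
}

void launch_capped(const float* knn_dists, float* vals, uint64_t to_process, cudaStream_t stream)
{
  constexpr uint32_t TPB_X = 256;
  constexpr uint64_t cap   = 65535;  // conservative block cap per the review comment
  const uint64_t wanted    = (to_process + TPB_X - 1) / TPB_X;
  const uint32_t blocks    = static_cast<uint32_t>(std::min(wanted, cap));
  strength_stub<<<blocks, TPB_X, 0, stream>>>(knn_dists, vals, to_process);
}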
6 changes: 4 additions & 2 deletions cpp/src/umap/init_embed/random_algo.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -20,14 +20,16 @@

 #include <raft/random/rng.cuh>

+#include <stdint.h>
+
 namespace UMAPAlgo {
 namespace InitEmbed {
 namespace RandomInit {

 using namespace ML;

 template <typename T>
-void launcher(int n, int d, UMAPParams* params, T* embedding, cudaStream_t stream)
+void launcher(uint64_t n, int d, UMAPParams* params, T* embedding, cudaStream_t stream)
 {
   uint64_t seed = params->random_state;
9 changes: 6 additions & 3 deletions cpp/src/umap/init_embed/spectral_algo.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -29,6 +29,8 @@
 #include <thrust/execution_policy.h>
 #include <thrust/extrema.h>

+#include <stdint.h>
+
 #include <iostream>

 namespace UMAPAlgo {
@@ -44,15 +46,16 @@ using namespace ML;
  */
 template <typename T>
 void launcher(const raft::handle_t& handle,
-              int n,
+              uint64_t n,
               int d,
              raft::sparse::COO<float>* coo,
               UMAPParams* params,
               T* embedding)
 {
   cudaStream_t stream = handle.get_stream();

-  ASSERT(n > params->n_components, "Spectral layout requires n_samples > n_components");
+  ASSERT(n > static_cast<uint64_t>(params->n_components),
+         "Spectral layout requires n_samples > n_components");

   rmm::device_uvector<T> tmp_storage(n * params->n_components, stream);

Member, on the new uint64_t n parameter: Use idx_t.
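A minimal sketch of that suggestion (the alias name, width, and namespace placement are assumptions, not from this PR): define the index type once and write signatures against it, so a future change of width is a one-line edit rather than another sweep like this one.

#include <cstdint>
#include <cstdio>

namespace ML {
using idx_t = uint64_t;  // hypothetical project-wide index alias
}

// The launcher would then take ML::idx_t n instead of spelling uint64_t inline.
void launcher(ML::idx_t n, int d)
{
  std::printf("n = %llu, d = %d\n", static_cast<unsigned long long>(n), d);
}

int main()
{
  ML::idx_t n = 3000000000ULL;  // above INT32_MAX: the large-input regime this PR targets
  launcher(n, 2);
  return 0;
}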
10 changes: 6 additions & 4 deletions cpp/src/umap/knn_graph/algo.cuh
@@ -35,6 +35,7 @@

 #include <cuvs/distance/distance.hpp>
 #include <cuvs/neighbors/brute_force.hpp>
+#include <stdint.h>

 #include <iostream>

@@ -138,9 +139,10 @@ inline void launcher(const raft::handle_t& handle,
       target[i * n_neighbors + j] = source[i * graph_degree + j];
     }
   }
-  raft::copy(handle,
-             raft::make_device_matrix_view(out.knn_indices, inputsA.n, n_neighbors),
-             temp_indices_h.view());
+  raft::copy(
+    handle,
+    raft::make_device_matrix_view(out.knn_indices, inputsA.n, static_cast<uint64_t>(n_neighbors)),
+    temp_indices_h.view());

   // `graph.distances()` is a device array (n x graph_degree).
   // Slice and copy to the output device array `out.knn_dists` (n x n_neighbors).
@@ -152,7 +154,7 @@
   raft::matrix::slice<float, int64_t, raft::row_major>(
     handle,
     raft::make_const_mdspan(graph.distances().value()),
-    raft::make_device_matrix_view(out.knn_dists, inputsA.n, n_neighbors),
+    raft::make_device_matrix_view(out.knn_dists, inputsA.n, static_cast<uint64_t>(n_neighbors)),
     coords);
 }
18 changes: 8 additions & 10 deletions cpp/src/umap/runner.cuh
@@ -47,6 +47,8 @@
 #include <thrust/scan.h>
 #include <thrust/system/cuda/execution_policy.h>

+#include <stdint.h>
+
 #include <memory>

 namespace UMAPAlgo {

@@ -348,7 +350,7 @@
 /**
  *
  */
-template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
+template <typename value_idx, typename value_t, typename umap_inputs, uint64_t TPB_X>
 void _transform(const raft::handle_t& handle,
                 const umap_inputs& inputs,
                 umap_inputs& orig_x_inputs,
@@ -425,7 +427,7 @@ void _transform(const raft::handle_t& handle,
   /*
    * Compute graph of membership strengths
    */

-  int nnz = inputs.n * params->n_neighbors;
+  uint64_t nnz = uint64_t{inputs.n} * params->n_neighbors;

   dim3 grid_nnz(raft::ceildiv(nnz, TPB_X), 1, 1);

@@ -437,6 +439,7 @@

   raft::sparse::COO<value_t> graph_coo(stream, nnz, inputs.n, inputs.n);

+  uint64_t to_process = static_cast<uint64_t>(graph_coo.n_rows) * params->n_neighbors;
   FuzzySimplSetImpl::compute_membership_strength_kernel<TPB_X>
     <<<grid_nnz, blk, 0, stream>>>(knn_graph.knn_indices,
                                    knn_graph.knn_dists,
@@ -445,15 +448,13 @@ void _transform(const raft::handle_t& handle,
                                    graph_coo.vals(),
                                    graph_coo.rows(),
                                    graph_coo.cols(),
-                                   graph_coo.n_rows,
-                                   params->n_neighbors);
+                                   params->n_neighbors,
+                                   to_process);
   RAFT_CUDA_TRY(cudaPeekAtLastError());

-  rmm::device_uvector<int> row_ind(inputs.n, stream);
-  rmm::device_uvector<int> ia(inputs.n, stream);
+  rmm::device_uvector<uint64_t> row_ind(inputs.n, stream);

   raft::sparse::convert::sorted_coo_to_csr(&graph_coo, row_ind.data(), stream);
-  raft::sparse::linalg::coo_degree(&graph_coo, ia.data(), stream);

   rmm::device_uvector<value_t> vals_normed(graph_coo.nnz, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(vals_normed.data(), 0, graph_coo.nnz * sizeof(value_t), stream));

Member, on the new nnz computation: Please don't hardcode these types. Create an nnz_t.
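A sketch of that suggestion (the alias spelling is assumed): name the nonzero-count type once, and make the 64-bit promotion happen before the multiply instead of after it has already overflowed.

#include <cstdint>
#include <cstdio>

using nnz_t = uint64_t;  // hypothetical alias per the review comment

int main()
{
  int n           = 40000000;  // rows, in the large-input regime
  int n_neighbors = 64;

  // In plain int arithmetic, n * n_neighbors would need 2.56e9 > INT32_MAX
  // and overflow; promoting one operand first keeps the product in 64 bits.
  nnz_t nnz = static_cast<nnz_t>(n) * n_neighbors;

  std::printf("nnz = %llu\n", static_cast<unsigned long long>(nnz));
  return 0;
}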


Contributor, on the removed coo_degree call: So we never needed the degree here in the first place?
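For context on why the degree array could go, a standalone sketch with made-up data (no RAFT dependency): the CSR row-pointer array that sorted_coo_to_csr produces already encodes each row's degree as the difference of adjacent offsets, so a separate coo_degree pass duplicates information.

#include <cstdio>
#include <vector>

int main()
{
  // Row indices of a sorted COO matrix with 3 rows, as sorted_coo_to_csr sees them.
  std::vector<int> coo_rows = {0, 0, 1, 2, 2, 2};
  const int n_rows = 3;

  // Count entries per row, then prefix-sum to get CSR row offsets.
  std::vector<int> ptr(n_rows + 1, 0);
  for (int r : coo_rows) ++ptr[r + 1];
  for (int i = 0; i < n_rows; ++i) ptr[i + 1] += ptr[i];

  // Degree of row i falls out as ptr[i + 1] - ptr[i]; no extra pass required.
  for (int i = 0; i < n_rows; ++i)
    std::printf("row %d: degree %d\n", i, ptr[i + 1] - ptr[i]);
  return 0;
}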
@@ -471,9 +472,6 @@
                 params->n_components,
                 transformed,
                 params->n_neighbors);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  RAFT_CUDA_TRY(cudaMemsetAsync(ia.data(), 0.0, ia.size() * sizeof(int), stream));

   RAFT_CUDA_TRY(cudaPeekAtLastError());