Fix UMAP issues with large inputs #6245
base: branch-25.04
@@ -20,4 +20,4 @@ c_stdlib_version:
   - "=2.28"
 
 treelite_version:
-  - "=4.4.1"
+  - "=4.3.0"
@@ -73,8 +73,8 @@ endfunction()
 # To use a different CUVS locally, set the CMake variable
 # CPM_cuvs_SOURCE=/path/to/local/cuvs
 find_and_configure_cuvs(VERSION    ${CUML_MIN_VERSION_cuvs}
-                        FORK       rapidsai
-                        PINNED_TAG branch-${CUML_BRANCH_VERSION_cuvs}
+                        FORK       divyegala
+                        PINNED_TAG raft-sparse-updates

Comment on lines +76 to +77: Reminder to revert this before merge.

                         EXCLUDE_FROM_ALL ${CUML_EXCLUDE_CUVS_FROM_ALL}
                         # When PINNED_TAG above doesn't match cuml,
                         # force local cuvs clone in build directory
@@ -72,8 +72,8 @@ endfunction()
 # To use a different RAFT locally, set the CMake variable
 # CPM_raft_SOURCE=/path/to/local/raft
 find_and_configure_raft(VERSION    ${CUML_MIN_VERSION_raft}
-                        FORK       rapidsai
-                        PINNED_TAG branch-${CUML_BRANCH_VERSION_raft}
+                        FORK       viclafargue
+                        PINNED_TAG fix-sparse-utilities

Comment on lines +75 to +76: Reminder to revert this before merge.

                         EXCLUDE_FROM_ALL ${CUML_EXCLUDE_RAFT_FROM_ALL}
                         # When PINNED_TAG above doesn't match cuml,
                         # force local raft clone in build directory
@@ -79,7 +79,7 @@ function(find_and_configure_treelite)
   rapids_export_find_package_root(BUILD Treelite [=[${CMAKE_CURRENT_LIST_DIR}]=] EXPORT_SET cuml-exports)
 endfunction()
 
-find_and_configure_treelite(VERSION    4.4.1
-                            PINNED_TAG 386bd0de99f5a66584c7e58221ee38ce606ad1ae
+find_and_configure_treelite(VERSION    4.3.0
+                            PINNED_TAG 575e4208f2b18e40d818c338ecb95d7a26e69aab

Comment on lines +82 to +83: What is the motivation for the downgrade?

                             EXCLUDE_FROM_ALL ${CUML_EXCLUDE_TREELITE_FROM_ALL}
                             BUILD_STATIC_LIBS ${CUML_USE_TREELITE_STATIC})
@@ -37,6 +37,7 @@
 
 #include <cuvs/distance/distance.hpp>
 #include <pca/pca.cuh>
+#include <stdint.h>
 
 namespace ML {
 
@@ -167,7 +168,7 @@ class TSNE_runner {
   {
     distance_and_perplexity();
 
-    const auto NNZ = COO_Matrix.nnz;
+    const auto NNZ = (value_idx)COO_Matrix.nnz;

Comment on the changed line (conversation marked as resolved by viclafargue): Let's not let it prevent merge, but if we need to make further changes, it would be good to avoid raw casts.

     auto* VAL       = COO_Matrix.vals();
     const auto* COL = COO_Matrix.cols();
     const auto* ROW = COO_Matrix.rows();
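The suggested change attached to that comment was not captured in this extract. As a rough sketch of the direction it points in, assuming the intent is simply an explicit, checked narrowing instead of the raw (value_idx) cast (checked_narrow is a hypothetical helper, not part of the PR):

#include <cstdint>
#include <limits>
#include <stdexcept>

// Hypothetical helper: narrow a (possibly 64-bit) nnz count to the index type
// used by the COO matrix, failing loudly instead of truncating silently.
template <typename value_idx>
value_idx checked_narrow(std::uint64_t nnz)
{
  if (nnz > static_cast<std::uint64_t>(std::numeric_limits<value_idx>::max())) {
    throw std::overflow_error("nnz does not fit in value_idx");
  }
  return static_cast<value_idx>(nnz);
}

// At the call site this would read something like:
//   const auto NNZ = checked_narrow<value_idx>(COO_Matrix.nnz);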
@@ -30,6 +30,7 @@
 
 #include <cuda_runtime.h>
 
+#include <stdint.h>
 #include <stdio.h>
 
 #include <string>
 
@@ -92,7 +93,8 @@ CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists,
 {
   // row-based matrix 1 thread per row
   int row = (blockIdx.x * TPB_X) + threadIdx.x;
-  int i = row * n_neighbors;  // each thread processes one row of the dist matrix
+  uint64_t i =
+    static_cast<uint64_t>(row) * n_neighbors;  // each thread processes one row of the dist matrix
 
   if (row < n) {
     float target = __log2f(n_neighbors) * bandwidth;
 
@@ -190,7 +192,7 @@ CUML_KERNEL void smooth_knn_dist_kernel(const value_t* knn_dists,
  *
  * Descriptions adapted from: https://github.com/lmcinnes/umap/blob/master/umap/umap_.py
  */
-template <int TPB_X, typename value_idx, typename value_t>
+template <uint64_t TPB_X, typename value_idx, typename value_t>
 CUML_KERNEL void compute_membership_strength_kernel(
   const value_idx* knn_indices,
   const float* knn_dists,  // nn outputs
 
@@ -199,14 +201,14 @@ CUML_KERNEL void compute_membership_strength_kernel(
   value_t* vals,
   int* rows,
   int* cols,  // result coo
-  int n,
-  int n_neighbors)
+  int n_neighbors,
+  uint64_t to_process)
 {  // model params
 
   // row-based matrix is best
-  int idx = (blockIdx.x * TPB_X) + threadIdx.x;
+  uint64_t idx = (blockIdx.x * TPB_X) + threadIdx.x;
 
-  if (idx < n * n_neighbors) {
+  if (idx < to_process) {
     int row = idx / n_neighbors;  // one neighbor per thread
 
     double cur_rho = rhos[row];
 
@@ -237,8 +239,8 @@ CUML_KERNEL void compute_membership_strength_kernel(
 /*
  * Sets up and runs the knn dist smoothing
  */
-template <int TPB_X, typename value_idx, typename value_t>
-void smooth_knn_dist(int n,
+template <uint64_t TPB_X, typename value_idx, typename value_t>
+void smooth_knn_dist(uint64_t n,
                      const value_idx* knn_indices,
                      const float* knn_dists,
                      value_t* rhos,
 
@@ -253,7 +255,8 @@ void smooth_knn_dist(int n,
 
   rmm::device_uvector<value_t> dist_means_dev(n_neighbors, stream);
 
-  raft::stats::mean(dist_means_dev.data(), knn_dists, 1, n_neighbors * n, false, false, stream);
+  raft::stats::mean(
+    dist_means_dev.data(), knn_dists, uint64_t{1}, n * n_neighbors, false, false, stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   value_t mean_dist = 0.0;
 
@@ -284,8 +287,8 @@ void smooth_knn_dist(int n,
  * @param params UMAPParams config object
  * @param stream cuda stream to use for device operations
  */
-template <int TPB_X, typename value_idx, typename value_t>
-void launcher(int n,
+template <uint64_t TPB_X, typename value_idx, typename value_t>
+void launcher(uint64_t n,
               const value_idx* knn_indices,
               const value_t* knn_dists,
               int n_neighbors,
 
@@ -328,7 +331,8 @@ void launcher(int n,
   /**
    * Compute graph of membership strengths
    */
 
-  dim3 grid_elm(raft::ceildiv(n * n_neighbors, TPB_X), 1, 1);
+  uint64_t to_process = static_cast<uint64_t>(in.n_rows) * n_neighbors;
+  dim3 grid_elm(raft::ceildiv(to_process, TPB_X), 1, 1);

Comment on the added lines: I believe there's a chance for an overflow here. If …

   dim3 blk_elm(TPB_X, 1, 1);
 
   compute_membership_strength_kernel<TPB_X><<<grid_elm, blk_elm, 0, stream>>>(knn_indices,
 
@@ -338,8 +342,8 @@ void launcher(int n,
                                                                               in.vals(),
                                                                               in.rows(),
                                                                               in.cols(),
-                                                                              in.n_rows,
-                                                                              n_neighbors);
+                                                                              n_neighbors,
+                                                                              to_process);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   if (ML::default_logger().should_log(ML::level_enum::debug)) {
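Regarding the overflow concern above (the comment is cut off in this extract): the likely issue is that raft::ceildiv(to_process, TPB_X) is a 64-bit value, while dim3's fields are 32-bit unsigned ints and CUDA caps the x grid dimension at 2^31 - 1, so a very large to_process would be silently truncated when the grid is constructed. A small standalone sketch of that failure mode, with illustrative names only:

#include <cstdint>
#include <cstdio>

// Stand-in for dim3's 32-bit x field; this only illustrates the narrowing.
struct grid_dim { unsigned int x; };

int main()
{
  const std::uint64_t TPB_X      = 32;
  const std::uint64_t to_process = 1ull << 40;  // e.g. n_rows * n_neighbors for a huge input

  const std::uint64_t n_blocks = (to_process + TPB_X - 1) / TPB_X;  // ceildiv

  grid_dim grid{static_cast<unsigned int>(n_blocks)};  // silent truncation happens here
  std::printf("wanted %llu blocks, 32-bit grid.x holds %u\n",
              static_cast<unsigned long long>(n_blocks), grid.x);

  // A guard (or a grid-stride loop in the kernel) avoids relying on grid.x alone:
  if (n_blocks > 0x7fffffffull) {
    std::printf("grid.x limit exceeded: chunk the launch or use a grid-stride loop\n");
  }
  return 0;
}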
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2024, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
 
@@ -29,6 +29,8 @@
 #include <thrust/execution_policy.h>
 #include <thrust/extrema.h>
 
+#include <stdint.h>
+
 #include <iostream>
 
 namespace UMAPAlgo {
 
@@ -44,15 +46,16 @@ using namespace ML;
  */
 template <typename T>
 void launcher(const raft::handle_t& handle,
-              int n,
+              uint64_t n,

Comment on the changed line: Use …

               int d,
               raft::sparse::COO<float>* coo,
               UMAPParams* params,
               T* embedding)
 {
   cudaStream_t stream = handle.get_stream();
 
-  ASSERT(n > params->n_components, "Spectral layout requires n_samples > n_components");
+  ASSERT(n > static_cast<uint64_t>(params->n_components),
+         "Spectral layout requires n_samples > n_components");
 
   rmm::device_uvector<T> tmp_storage(n * params->n_components, stream);
@@ -47,6 +47,8 @@
 #include <thrust/scan.h>
 #include <thrust/system/cuda/execution_policy.h>
 
+#include <stdint.h>
+
 #include <memory>
 
 namespace UMAPAlgo {
 
@@ -348,7 +350,7 @@ void _fit_supervised(const raft::handle_t& handle,
 /**
  *
  */
-template <typename value_idx, typename value_t, typename umap_inputs, int TPB_X>
+template <typename value_idx, typename value_t, typename umap_inputs, uint64_t TPB_X>
 void _transform(const raft::handle_t& handle,
                 const umap_inputs& inputs,
                 umap_inputs& orig_x_inputs,
 
@@ -425,7 +427,7 @@ void _transform(const raft::handle_t& handle,
   /**
    * Compute graph of membership strengths
    */
 
-  int nnz = inputs.n * params->n_neighbors;
+  uint64_t nnz = uint64_t{inputs.n} * params->n_neighbors;

Comment on the changed line: Please don't hardcode these types. Create an …

   dim3 grid_nnz(raft::ceildiv(nnz, TPB_X), 1, 1);
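The comment above is cut off, but it appears to ask for a project-level alias rather than hardcoding uint64_t at each call site. A minimal sketch of that idea (the name nnz_t and its placement are assumptions, not something defined in this PR):

#include <cstdint>

// Hypothetical central definition: pick the index width once and reuse it.
namespace UMAPAlgo {
using nnz_t = std::uint64_t;
}

// Call sites would then be written against the alias, e.g.:
//   UMAPAlgo::nnz_t nnz = static_cast<UMAPAlgo::nnz_t>(inputs.n) * params->n_neighbors;
// so a future change of the index width is a one-line edit.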
@@ -437,6 +439,7 @@ void _transform(const raft::handle_t& handle,
 
   raft::sparse::COO<value_t> graph_coo(stream, nnz, inputs.n, inputs.n);
 
+  uint64_t to_process = static_cast<uint64_t>(graph_coo.n_rows) * params->n_neighbors;
   FuzzySimplSetImpl::compute_membership_strength_kernel<TPB_X>
     <<<grid_nnz, blk, 0, stream>>>(knn_graph.knn_indices,
                                    knn_graph.knn_dists,
 
@@ -445,15 +448,13 @@ void _transform(const raft::handle_t& handle,
                                    graph_coo.vals(),
                                    graph_coo.rows(),
                                    graph_coo.cols(),
-                                   graph_coo.n_rows,
-                                   params->n_neighbors);
+                                   params->n_neighbors,
+                                   to_process);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
-  rmm::device_uvector<int> row_ind(inputs.n, stream);
-  rmm::device_uvector<int> ia(inputs.n, stream);
+  rmm::device_uvector<uint64_t> row_ind(inputs.n, stream);
 
   raft::sparse::convert::sorted_coo_to_csr(&graph_coo, row_ind.data(), stream);
-  raft::sparse::linalg::coo_degree(&graph_coo, ia.data(), stream);

Comment on the removed line: So we never needed the degree here in the first place?

   rmm::device_uvector<value_t> vals_normed(graph_coo.nnz, stream);
   RAFT_CUDA_TRY(cudaMemsetAsync(vals_normed.data(), 0, graph_coo.nnz * sizeof(value_t), stream));
 
@@ -471,9 +472,6 @@ void _transform(const raft::handle_t& handle,
                                                params->n_components,
                                                transformed,
                                                params->n_neighbors);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  RAFT_CUDA_TRY(cudaMemsetAsync(ia.data(), 0.0, ia.size() * sizeof(int), stream));
 
   RAFT_CUDA_TRY(cudaPeekAtLastError());
Review comment: Just noting that we'll need to make sure to switch these back once the associated PRs have been merged so we don't forget.