Commit dc1ca34

progress so far

viclafargue committed Jan 16, 2025
1 parent 2e06ab9 commit dc1ca34
Showing 6 changed files with 68 additions and 61 deletions.
12 changes: 6 additions & 6 deletions cpp/include/raft/linalg/detail/transpose.cuh
@@ -34,7 +34,7 @@ namespace raft {
 namespace linalg {
 namespace detail {
 
-template <typename IndexType, int TILE_DIM, int BLOCK_ROWS>
+template <typename IndexType, uint64_t TILE_DIM, uint64_t BLOCK_ROWS>
 RAFT_KERNEL transpose_half_kernel(IndexType n_rows,
                                   IndexType n_cols,
                                   const half* __restrict__ in,
@@ -44,10 +44,10 @@ RAFT_KERNEL transpose_half_kernel(IndexType n_rows,
 {
   __shared__ half tile[TILE_DIM][TILE_DIM + 1];
 
-  for (int block_offset_y = 0; block_offset_y < n_rows; block_offset_y += gridDim.y * TILE_DIM) {
-    for (int block_offset_x = 0; block_offset_x < n_cols; block_offset_x += gridDim.x * TILE_DIM) {
-      auto x = block_offset_x + blockIdx.x * TILE_DIM + threadIdx.x;
-      auto y = block_offset_y + blockIdx.y * TILE_DIM + threadIdx.y;
+  for (uint64_t block_offset_y = 0; block_offset_y < n_rows; block_offset_y += gridDim.y * TILE_DIM) {
+    for (uint64_t block_offset_x = 0; block_offset_x < n_cols; block_offset_x += gridDim.x * TILE_DIM) {
+      uint64_t x = block_offset_x + blockIdx.x * TILE_DIM + threadIdx.x;
+      uint64_t y = block_offset_y + blockIdx.y * TILE_DIM + threadIdx.y;
 
       for (int j = 0; j < TILE_DIM; j += BLOCK_ROWS) {
         if (x < n_cols && (y + j) < n_rows) {
@@ -117,7 +117,7 @@ void transpose_half(raft::resources const& handle,
   int num_blocks = max_active_blocks * sm_count;
 
   int grid_x = (n_cols + block_dim_x - 1) / block_dim_x;
-  int grid_y = (n_rows + block_dim_x - 1) / block_dim_x;
+  int grid_y = ((uint64_t)n_rows + block_dim_x - 1) / block_dim_x;
 
   float ratio = static_cast<float>(grid_y) / static_cast<float>(grid_x);
   int adjusted_grid_y =
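
Note on the transpose change: once a half-precision matrix exceeds 2^31 elements, 32-bit index arithmetic such as blockIdx.x * TILE_DIM + threadIdx.x can wrap, which is what the uint64_t template parameters and loop counters above prevent. A minimal sketch of the failure mode under that assumption (the kernel below is illustrative, not code from this commit):

  #include <cuda_fp16.h>

  // Illustrative: flat elementwise copy over more than 2^31 half elements.
  // Promoting to uint64_t BEFORE the multiply is the essential part;
  // int(blockIdx.x) * blockDim.x wraps on grids covering huge matrices.
  __global__ void copy_flat(const half* in, half* out, uint64_t n_elems)
  {
    uint64_t i = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i < n_elems) { out[i] = in[i]; }
  }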
4 changes: 2 additions & 2 deletions cpp/include/raft/sparse/linalg/detail/spectral.cuh
@@ -35,8 +35,8 @@ void fit_embedding(raft::resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
-                   int nnz,
-                   int n,
+                   uint64_t nnz,
+                   uint64_t n,
                    int n_components,
                    T* out,
                    unsigned long long seed = 1234567)
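
For scale, the widening of nnz and n matters because fit_embedding is commonly handed a symmetrized k-NN graph: 5 million vertices with 500 neighbors each already yields 2.5 x 10^9 nonzeros, beyond INT_MAX (about 2.147 x 10^9), so a 32-bit nnz would silently wrap before the solver ever ran.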
2 changes: 1 addition & 1 deletion cpp/include/raft/sparse/linalg/detail/symmetrize.cuh
@@ -104,7 +104,7 @@ RAFT_KERNEL coo_symmetrize_kernel(uint64_t* row_ind,
       // Note that if we did find a match, we don't need to
       // compute `res` on it here because it will be computed
       // in a different thread.
-      if (!found_match && vals[idx] != 0.0) {
+      if (!found_match && cur_val != 0.0) {
         orows[out_start_idx + row_nnz] = cur_col;
         ocols[out_start_idx + row_nnz] = cur_row;
         ovals[out_start_idx + row_nnz] = res;
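
This one-line change is a correctness fix rather than a type widening: as far as the surrounding kernel shows, cur_val caches the value of the entry this thread is emitting, while idx points at a different position in the scan, so testing vals[idx] could keep or drop the wrong nonzero.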
4 changes: 2 additions & 2 deletions cpp/include/raft/sparse/linalg/spectral.cuh
@@ -28,7 +28,7 @@ void fit_embedding(raft::resources const& handle,
                    int* rows,
                    int* cols,
                    T* vals,
-                   int nnz,
+                   uint64_t nnz,
                    int n,
                    int n_components,
                    T* out,
@@ -40,4 +40,4 @@ void fit_embedding(raft::resources const& handle,
 };  // namespace sparse
 };  // namespace raft
 
-#endif
\ No newline at end of file
+#endif
101 changes: 54 additions & 47 deletions cpp/include/raft/sparse/op/detail/filter.cuh
@@ -43,34 +43,42 @@ namespace op {
 namespace detail {
 
 template <uint64_t TPB_X, typename T>
-RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
-                                     const int* cols,
-                                     const T* vals,
+RAFT_KERNEL coo_remove_scalar_kernel(const int* in_rows,
+                                     const int* in_cols,
+                                     const T* in_vals,
                                      uint64_t nnz,
-                                     int* crows,
-                                     int* ccols,
-                                     T* cvals,
-                                     uint64_t* ex_scan,
-                                     uint64_t* cur_ex_scan,
-                                     int m,
-                                     T scalar)
+                                     int* out_rows,
+                                     int* out_cols,
+                                     T* out_vals,
+                                     uint64_t* row_indices,
+                                     int* rows_length_acc,
+                                     T scalar,
+                                     int n_rows)
 {
-  int row = (blockIdx.x * TPB_X) + threadIdx.x;
-
-  if (row < m) {
-    uint64_t start       = cur_ex_scan[row];
-    uint64_t stop        = get_stop_idx(row, m, nnz, cur_ex_scan);
-    uint64_t cur_out_idx = ex_scan[row];
-
-    for (uint64_t idx = start; idx < stop; idx++) {
-      if (vals[idx] != scalar) {
-        crows[cur_out_idx] = rows[idx];
-        ccols[cur_out_idx] = cols[idx];
-        cvals[cur_out_idx] = vals[idx];
-        ++cur_out_idx;
-      }
-    }
-  }
+  uint64_t in_idx = (blockIdx.x * TPB_X) + threadIdx.x;
+
+  if (in_idx >= nnz)
+    return;
+
+  T val = in_vals[in_idx];
+
+  if (val == scalar)
+    return;
+
+  int row = in_rows[in_idx];
+
+  if (row >= n_rows)
+    printf("row >= n_rows : %d\n", row);
+
+  uint64_t row_start_index = row_indices[row];
+  uint64_t out_idx = row_start_index + atomicAdd(rows_length_acc + row, 1);
+
+  if (out_idx >= nnz)
+    printf("out_idx >= nnz: %lu\n", out_idx);
+
+  out_rows[out_idx] = row;
+  out_cols[out_idx] = in_cols[in_idx];
+  out_vals[out_idx] = val;
 }
 
 /**
@@ -90,7 +98,7 @@ RAFT_KERNEL coo_remove_scalar_kernel(const int* rows,
  * @param d_alloc device allocator for temporary buffers
  * @param stream: cuda stream to use
  */
-template <int TPB_X, typename T>
+template <uint64_t TPB_X, typename T>
 void coo_remove_scalar(const int* rows,
                        const int* cols,
                        const T* vals,
@@ -99,28 +107,31 @@ void coo_remove_scalar(const int* rows,
                        int* ccols,
                        T* cvals,
                        int* cnnz,
-                       int* cur_cnnz,
                        T scalar,
                        int n,
                        cudaStream_t stream)
 {
   rmm::device_uvector<uint64_t> ex_scan(n, stream);
-  rmm::device_uvector<uint64_t> cur_ex_scan(n, stream);
 
-  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(cur_ex_scan.data(), 0, (uint64_t)n * sizeof(int), stream));
+  RAFT_CUDA_TRY(cudaMemsetAsync(ex_scan.data(), 0, (uint64_t)n * sizeof(uint64_t), stream));
 
   thrust::device_ptr<int> dev_cnnz = thrust::device_pointer_cast(cnnz);
   thrust::device_ptr<uint64_t> dev_ex_scan = thrust::device_pointer_cast(ex_scan.data());
   thrust::exclusive_scan(rmm::exec_policy(stream), dev_cnnz, dev_cnnz + n, dev_ex_scan);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
-
-  thrust::device_ptr<int> dev_cur_cnnz = thrust::device_pointer_cast(cur_cnnz);
-  thrust::device_ptr<uint64_t> dev_cur_ex_scan = thrust::device_pointer_cast(cur_ex_scan.data());
-  thrust::exclusive_scan(rmm::exec_policy(stream), dev_cur_cnnz, dev_cur_cnnz + n, dev_cur_ex_scan);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
+  raft::print_device_vector("ex_scan", ex_scan.data(), 20, std::cout);
+
+  auto maxiter_ex_scan = thrust::max_element(dev_ex_scan, dev_ex_scan + n);
+  uint64_t max_ex_scan = *maxiter_ex_scan;
+
+  std::cout << "max_ex_scan: " << max_ex_scan << std::endl;
+  std::cout << "nnz: " << nnz << std::endl;
+  std::cout << "n: " << n << std::endl;
+
+  rmm::device_uvector<int> rows_length_acc(n, stream);
+  RAFT_CUDA_TRY(cudaMemsetAsync(rows_length_acc.data(), 0, (uint64_t)n * sizeof(int), stream));
 
-  dim3 grid(raft::ceildiv(n, TPB_X), 1, 1);
+  dim3 grid(raft::ceildiv(nnz, TPB_X), 1, 1);
   dim3 blk(TPB_X, 1, 1);
 
   coo_remove_scalar_kernel<TPB_X><<<grid, blk, 0, stream>>>(rows,
@@ -131,9 +142,9 @@ void coo_remove_scalar(const int* rows,
                                                             ccols,
                                                             cvals,
                                                             dev_ex_scan.get(),
-                                                            dev_cur_ex_scan.get(),
-                                                            n,
-                                                            scalar);
+                                                            rows_length_acc.data(),
+                                                            scalar,
+                                                            n);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 }

@@ -149,20 +160,17 @@ template <int TPB_X, typename T>
 void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
 {
   rmm::device_uvector<int> row_count_nz(in->n_rows, stream);
-  rmm::device_uvector<int> row_count(in->n_rows, stream);
 
   RAFT_CUDA_TRY(cudaMemsetAsync(row_count_nz.data(), 0, (uint64_t)in->n_rows * sizeof(int), stream));
-  RAFT_CUDA_TRY(cudaMemsetAsync(row_count.data(), 0, (uint64_t)in->n_rows * sizeof(int), stream));
-
-  linalg::coo_degree(in->rows(), in->nnz, row_count.data(), stream);
-  RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   linalg::coo_degree_scalar(in->rows(), in->vals(), in->nnz, scalar, row_count_nz.data(), stream);
   RAFT_CUDA_TRY(cudaPeekAtLastError());
 
   thrust::device_ptr<int> d_row_count_nz = thrust::device_pointer_cast(row_count_nz.data());
   uint64_t out_nnz = thrust::reduce(rmm::exec_policy(stream), d_row_count_nz, d_row_count_nz + in->n_rows, (uint64_t)0);
+  raft::print_device_vector("row_count_nz", row_count_nz.data(), 20, std::cout);
+  std::cout << "!!! out_nnz: " << out_nnz << std::endl;
 
   out->allocate(out_nnz, in->n_rows, in->n_cols, false, stream);
 
   coo_remove_scalar<TPB_X, T>(in->rows(),
@@ -173,7 +181,6 @@ void coo_remove_scalar(COO<T>* in, COO<T>* out, T scalar, cudaStream_t stream)
                               out->cols(),
                               out->vals(),
                               row_count_nz.data(),
-                              row_count.data(),
                               scalar,
                               in->n_rows,
                               stream);
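
The filter rework changes the parallelization strategy: the old kernel gave each thread a row and walked that row's span between two exclusive scans, while the new one gives each thread a single input nonzero and claims an output slot via atomicAdd on a per-row cursor added to the row's scanned start offset (the printf bounds checks and std::cout dumps are debug scaffolding, consistent with the "progress so far" commit message). A condensed sketch of the scheme, with illustrative names:

  #include <cstdint>

  // Sketch: drop entries equal to `scalar` from a COO matrix.
  // row_start[r]  = exclusive scan of kept-entry counts per row
  // row_cursor[r] = number of slots already claimed in row r (zeroed first)
  template <typename T>
  __global__ void compact_coo(const int* rows, const int* cols, const T* vals,
                              uint64_t nnz, const uint64_t* row_start,
                              int* row_cursor, T scalar,
                              int* orows, int* ocols, T* ovals)
  {
    uint64_t i = uint64_t(blockIdx.x) * blockDim.x + threadIdx.x;
    if (i >= nnz || vals[i] == scalar) return;
    int r = rows[i];
    // Claim the next free slot in this row's output segment.
    uint64_t o = row_start[r] + uint64_t(atomicAdd(row_cursor + r, 1));
    orows[o] = r;
    ocols[o] = cols[i];
    ovals[o] = vals[i];
  }

One thread per nonzero balances load when row lengths are skewed, at the cost of ordering: atomics decide slot order within a row, so column order in the output is no longer deterministic, whereas the old per-row loop preserved input order but serialized long rows.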
6 changes: 3 additions & 3 deletions cpp/include/raft/sparse/solver/detail/lanczos.cuh
@@ -151,7 +151,7 @@ int performLanczosIteration(raft::resources const& handle,
 
   RAFT_EXPECTS(A != nullptr, "Null matrix pointer.");
 
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // -------------------------------------------------------
   // Compute second Lanczos vector
@@ -814,7 +814,7 @@ int computeSmallestEigenvectors(
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // Shift for implicit restart
   value_type_t shiftUpper;
@@ -1160,7 +1160,7 @@ int computeLargestEigenvectors(
   constexpr value_type_t zero = 0;
 
   // Matrix dimension
-  index_type_t n = A->nrows_;
+  uint64_t n = A->nrows_;
 
   // Lanczos iteration counters
   index_type_t maxIter_curr = restartIter;  // Maximum size of Lanczos system
