Skip to content

Commit

Permalink
Fixed J copy bug & reverted the col_swap change
Browse files Browse the repository at this point in the history
  • Loading branch information
TeachRaccooon committed Aug 21, 2024
1 parent 815530d commit f1e2299
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 11 deletions.
2 changes: 1 addition & 1 deletion RandLAPACK/drivers/rl_cqrrp_gpu.hh
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ int CQRRP_blocked_GPU<T, RNG>::call(
// Swap the roles of the live J buffer and its copy before patching columns.
J_copy_col_swap = J;
J = J_cpy_buf;
}
// NOTE(review): diff residue — both the pre-fix (`< iter`) and post-fix
// (`<= iter`) loop headers are present here; only the `<= iter` version
// should survive. The commit's fix extends the copy to include the block
// at odd_idx == iter, which the `<` bound skipped.
for(int odd_idx = 1; odd_idx < iter; odd_idx += 2) {
for(int odd_idx = 1; odd_idx <= iter; odd_idx += 2) {
// Copy one b_sz_const-long segment of pivot indices for each odd block
// index back into J on the GPU stream `strm`.
RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz_const, &J_copy_col_swap[odd_idx * b_sz_const], 1, &J[odd_idx * b_sz_const], 1);
}
// Block until the queued device copies have completed.
lapack_queue.sync();
Expand Down
18 changes: 10 additions & 8 deletions RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -318,20 +318,21 @@ __global__ void col_swap_gpu_kernel(
int64_t ldac,
int64_t const* idx
) {

// NOTE(review): dead grid-stride variant kept for reference. It has a real
// bug — the inner loop's condition and increment use `i` instead of `j`
// (`j = ...; i < m; i += ...`), so `j` is never advanced and the row loop is
// wrong. Do not resurrect this as-is.
/*
for (int64_t i = threadIdx.x + blockDim.x * blockIdx.x; i < k; i += blockDim.x * gridDim.x) {
for (int64_t j = threadIdx.y + blockDim.y * blockIdx.y; i < m; i += blockDim.y * gridDim.y) {
int64_t l = idx[i] - 1;
A[i * lda + j] = A_cpy[l * ldac + j];
}
}
*/

// Commented-out duplicate of the active implementation below (diff residue).
// int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
// int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
// if (colIdx < k && rowIdx < m) {
// int64_t j = idx[colIdx] - 1;
// A[colIdx * lda + rowIdx] = A_cpy[j * ldac + rowIdx];
// }
// Active implementation: one thread per (column, row) element of the first
// k columns. x-dimension of the launch indexes columns, y-dimension rows.
int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
// Bounds guard: the grid is sized by ceil-division, so trailing threads
// past k columns / m rows must do nothing.
if (colIdx < k && rowIdx < m) {
// idx values are 1-based (the -1 converts to a 0-based column index;
// presumably LAPACK-style pivot indices — confirm against the caller).
int64_t j = idx[colIdx] - 1;
// Column colIdx of A receives column idx[colIdx] of the pristine copy
// A_cpy; reading only from A_cpy makes the permutation race-free.
A[colIdx * lda + rowIdx] = A_cpy[j * ldac + rowIdx];
}
}

template <typename T>
Expand Down Expand Up @@ -469,7 +470,8 @@ void copy_mat_gpu(
bool copy_upper_triangle
) {
#ifdef USE_CUDA
// NOTE(review): diff residue — the line below is the pre-change declaration
// and duplicates the commented-out line that follows; only one
// threadsPerBlock declaration should survive in the real file.
dim3 threadsPerBlock(32, 8);
//dim3 threadsPerBlock(32, 8);
// NOTE(review): 11x11 = 121 threads per block is not a multiple of the
// 32-thread warp size, so the last warp of every block runs partially
// masked — confirm this shape was chosen deliberately rather than as a
// debugging leftover.
dim3 threadsPerBlock(11, 11);
// Ceil-divide so the grid covers all n columns (x) and all m rows (y).
dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
(m + threadsPerBlock.y - 1) / threadsPerBlock.y);
// Launch the element-wise copy kernel on the caller-supplied stream.
copy_mat_gpu<<<numBlocks, threadsPerBlock, 0, stream>>>(m, n, A, lda, A_cpy, ldat, copy_upper_triangle);
Expand Down
2 changes: 0 additions & 2 deletions test/drivers/test_cqrrp_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,6 @@ class TestCQRRP : public ::testing::TestWithParam<int64_t>
auto n = all_data.col;

CQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device);
/*
all_data.rank = CQRRP_GPU.rank;
printf("RANK AS RETURNED BY CQRRP GPU %4ld\n", all_data.rank);

Expand All @@ -234,7 +233,6 @@ class TestCQRRP : public ::testing::TestWithParam<int64_t>
RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J);

error_check(norm_A, all_data);
*/
}

template <typename T, typename RNG, typename alg_gpu, typename alg_cpu>
Expand Down

0 comments on commit f1e2299

Please sign in to comment.