From f1e229941a87c766a6ae7ee6fe608e3129697441 Mon Sep 17 00:00:00 2001
From: TeachRaccooon
Date: Wed, 21 Aug 2024 12:40:02 -0700
Subject: [PATCH] Fixed J copy bug & reverted the col_swap change

---
 RandLAPACK/drivers/rl_cqrrp_gpu.hh           |  2 +-
 RandLAPACK/gpu_functions/rl_cuda_kernels.cuh | 18 ++++++++++--------
 test/drivers/test_cqrrp_gpu.cu               |  2 --
 3 files changed, 11 insertions(+), 11 deletions(-)

diff --git a/RandLAPACK/drivers/rl_cqrrp_gpu.hh b/RandLAPACK/drivers/rl_cqrrp_gpu.hh
index 823528ea..32880b10 100644
--- a/RandLAPACK/drivers/rl_cqrrp_gpu.hh
+++ b/RandLAPACK/drivers/rl_cqrrp_gpu.hh
@@ -610,7 +610,7 @@ int CQRRP_blocked_GPU::call(
                 J_copy_col_swap = J;
                 J = J_cpy_buf;
             }
-            for(int odd_idx = 1; odd_idx < iter; odd_idx += 2) {
+            for(int odd_idx = 1; odd_idx <= iter; odd_idx += 2) {
                 RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz_const, &J_copy_col_swap[odd_idx * b_sz_const], 1, &J[odd_idx * b_sz_const], 1);
             }
             lapack_queue.sync();
diff --git a/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh b/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
index efbcb86a..7f02fcca 100644
--- a/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
+++ b/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
@@ -318,20 +318,21 @@ __global__ void col_swap_gpu_kernel(
     int64_t ldac,
     int64_t const* idx
 ) {
-
+    /*
     for (int64_t i = threadIdx.x + blockDim.x * blockIdx.x; i < k; i += blockDim.x * gridDim.x) {
         for (int64_t j = threadIdx.y + blockDim.y * blockIdx.y; i < m; i += blockDim.y * gridDim.y) {
             int64_t l = idx[i] - 1;
             A[i * lda + j] = A_cpy[l * ldac + j];
         }
     }
+    */
 
-    // int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
-    // int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
-    // if (colIdx < k && rowIdx < m) {
-    //     int64_t j = idx[colIdx] - 1;
-    //     A[colIdx * lda + rowIdx] = A_cpy[j * ldac + rowIdx];
-    // }
+    int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
+    int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
+    if (colIdx < k && rowIdx < m) {
+        int64_t j = idx[colIdx] - 1;
+        A[colIdx * lda + rowIdx] = A_cpy[j * ldac + rowIdx];
+    }
 }
 
 template <typename T>
@@ -469,7 +470,8 @@ void copy_mat_gpu(
     bool copy_upper_triangle
 ) {
 #ifdef USE_CUDA
-    dim3 threadsPerBlock(32, 8);
+    //dim3 threadsPerBlock(32, 8);
+    dim3 threadsPerBlock(11, 11);
     dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
                    (m + threadsPerBlock.y - 1) / threadsPerBlock.y);
     copy_mat_gpu<<<numBlocks, threadsPerBlock, 0, stream>>>(m, n, A, lda, A_cpy, ldat, copy_upper_triangle);
diff --git a/test/drivers/test_cqrrp_gpu.cu b/test/drivers/test_cqrrp_gpu.cu
index c974e28e..84abc625 100644
--- a/test/drivers/test_cqrrp_gpu.cu
+++ b/test/drivers/test_cqrrp_gpu.cu
@@ -217,7 +217,6 @@ class TestCQRRP : public ::testing::TestWithParam
         auto n = all_data.col;
 
         CQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device);
-        /*
         all_data.rank = CQRRP_GPU.rank;
         printf("RANK AS RETURNED BY CQRRP GPU %4ld\n", all_data.rank);
 
@@ -234,7 +233,6 @@ class TestCQRRP : public ::testing::TestWithParam
         RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J);
 
         error_check(norm_A, all_data);
-        */
     }
 
 template <typename T>
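
Note on the first hunk: the driver appears to ping-pong the pivot vector between `J` and `J_copy_col_swap`, so blocks written on odd iterations end up in the other buffer and have to be gathered back into `J` once the factorization finishes. Below is a minimal host-side sketch of that gather, assuming blocks of size `b_sz` and a swap buffer called `J_swap`; the function name and the `std::copy` stand-in for `cuda_kernels::copy_gpu` are illustrative, not RandLAPACK's API. The point of the patch is the bound: with `< iter`, the block at index `iter` is skipped whenever the last iteration index is odd.

```cpp
#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative only: gather odd-indexed b_sz blocks of the pivot vector from a
// swap buffer back into J, including the block at index `iter` itself.
void gather_odd_blocks(std::vector<int64_t>& J,
                       const std::vector<int64_t>& J_swap,
                       int64_t b_sz, int64_t iter) {
    // "<= iter" (not "< iter") so the final odd block is not skipped --
    // the off-by-one the patch fixes.
    for (int64_t odd_idx = 1; odd_idx <= iter; odd_idx += 2) {
        std::copy(J_swap.begin() + odd_idx * b_sz,
                  J_swap.begin() + (odd_idx + 1) * b_sz,
                  J.begin() + odd_idx * b_sz);
    }
}
```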
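Note on the reverted `col_swap_gpu_kernel`: the re-enabled body assigns one thread per matrix element, with the x dimension indexing the destination column and the y dimension the row, and reads the source column through the 1-based index array `idx`. A minimal self-contained sketch of that formulation with a matching 2D launch follows; the wrapper name, block shape, and stream argument are assumptions for illustration, not RandLAPACK's actual wrapper.

```cuda
#include <cstdint>
#include <cuda_runtime.h>

// One thread per (row, column) element of a column-major matrix: column colIdx
// of A receives column idx[colIdx] (1-based) of the copy A_cpy.
template <typename T>
__global__ void col_swap_sketch(int64_t m, int64_t k,
                                T* A, int64_t lda,
                                const T* A_cpy, int64_t ldac,
                                const int64_t* idx) {
    int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
    int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
    if (colIdx < k && rowIdx < m) {
        int64_t src = idx[colIdx] - 1;  // idx is 1-based
        A[colIdx * lda + rowIdx] = A_cpy[src * ldac + rowIdx];
    }
}

// Illustrative launch: cover k columns in x and m rows in y.
template <typename T>
void col_swap_sketch_launch(int64_t m, int64_t k, T* A, int64_t lda,
                            const T* A_cpy, int64_t ldac, const int64_t* idx,
                            cudaStream_t stream) {
    dim3 threadsPerBlock(32, 8);
    dim3 numBlocks((k + threadsPerBlock.x - 1) / threadsPerBlock.x,
                   (m + threadsPerBlock.y - 1) / threadsPerBlock.y);
    col_swap_sketch<T><<<numBlocks, threadsPerBlock, 0, stream>>>(
        m, k, A, lda, A_cpy, ldac, idx);
}
```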