diff --git a/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh b/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh index 7f02fcca..8f75826b 100644 --- a/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh +++ b/RandLAPACK/gpu_functions/rl_cuda_kernels.cuh @@ -319,10 +319,11 @@ __global__ void col_swap_gpu_kernel( int64_t const* idx ) { /* - for (int64_t i = threadIdx.x + blockDim.x * blockIdx.x; i < k; i += blockDim.x * gridDim.x) { - for (int64_t j = threadIdx.y + blockDim.y * blockIdx.y; i < m; i += blockDim.y * gridDim.y) { - int64_t l = idx[i] - 1; - A[i * lda + j] = A_cpy[l * ldac + j]; + for (int64_t colIdx = threadIdx.x + blockDim.x * blockIdx.x; colIdx < k; colIdx += blockDim.x * gridDim.x) { + for (int64_t rowIdx = threadIdx.y + blockDim.y * blockIdx.y; rowIdx < m; rowIdx += blockDim.y * gridDim.y) { + int64_t j = idx[colIdx] - 1; + A[colIdx * lda + rowIdx] = A_cpy[j * ldac + rowIdx]; + } } } */ @@ -470,8 +471,7 @@ void copy_mat_gpu( bool copy_upper_triangle ) { #ifdef USE_CUDA - //dim3 threadsPerBlock(32, 8); - dim3 threadsPerBlock(11, 11); + dim3 threadsPerBlock(32, 8); dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x, (m + threadsPerBlock.y - 1) / threadsPerBlock.y); copy_mat_gpu<<>>(m, n, A, lda, A_cpy, ldat, copy_upper_triangle); diff --git a/test/drivers/test_cqrrp.cc b/test/drivers/test_cqrrp.cc index 37dbd729..d2dd5b54 100644 --- a/test/drivers/test_cqrrp.cc +++ b/test/drivers/test_cqrrp.cc @@ -142,11 +142,11 @@ class TestCQRRP : public ::testing::Test // Note: If Subprocess killed exception -> reload vscode TEST_F(TestCQRRP, CQRRP_blocked_full_rank_basic_070824_cpu) { - int64_t m = 300;//5000; - int64_t n = 300;//2000; - int64_t k = 300; + int64_t m = 5000;//5000; + int64_t n = 2800;//2000; + int64_t k = 2800; double d_factor = 1;//1.0; - int64_t b_sz = 50;//500; + int64_t b_sz = 900;//500; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); diff --git a/test/drivers/test_cqrrp_gpu.cu b/test/drivers/test_cqrrp_gpu.cu index 79c90a59..c10fba9e 100644 --- a/test/drivers/test_cqrrp_gpu.cu +++ b/test/drivers/test_cqrrp_gpu.cu @@ -353,11 +353,11 @@ TEST_F(TestCQRRP, CQRRP_GPU_070824) { double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); - bool profile_runtime = false; + bool profile_runtime = true; CQRRPTestData all_data(m, n, k, d); RandLAPACK::CQRRP_blocked_GPU CQRRP_blocked_GPU(profile_runtime, tol, b_sz); - //CQRRP_blocked_GPU.use_qrf = true; + CQRRP_blocked_GPU.use_qrf = false; RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); //RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::polynomial);