Skip to content

Commit

Permalink
Small fix regarding col_swap_gpu
Browse files (browse the repository at this point in the history)
  • Loading branch information
TeachRaccooon committed Aug 21, 2024
1 parent 0e59731 commit e61bcf3
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 12 deletions.
12 changes: 6 additions & 6 deletions RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -319,10 +319,11 @@ __global__ void col_swap_gpu_kernel(
int64_t const* idx
) {
/*
for (int64_t i = threadIdx.x + blockDim.x * blockIdx.x; i < k; i += blockDim.x * gridDim.x) {
for (int64_t j = threadIdx.y + blockDim.y * blockIdx.y; i < m; i += blockDim.y * gridDim.y) {
int64_t l = idx[i] - 1;
A[i * lda + j] = A_cpy[l * ldac + j];
for (int64_t colIdx = threadIdx.x + blockDim.x * blockIdx.x; colIdx < k; colIdx += blockDim.x * gridDim.x) {
for (int64_t rowIdx = threadIdx.y + blockDim.y * blockIdx.y; rowIdx < m; rowIdx += blockDim.y * gridDim.y) {
int64_t j = idx[colIdx] - 1;
A[colIdx * lda + rowIdx] = A_cpy[j * ldac + rowIdx];
}
}
}
*/
Expand Down Expand Up @@ -470,8 +471,7 @@ void copy_mat_gpu(
bool copy_upper_triangle
) {
#ifdef USE_CUDA
//dim3 threadsPerBlock(32, 8);
dim3 threadsPerBlock(11, 11);
dim3 threadsPerBlock(32, 8);
dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
(m + threadsPerBlock.y - 1) / threadsPerBlock.y);
copy_mat_gpu<<<numBlocks, threadsPerBlock, 0, stream>>>(m, n, A, lda, A_cpy, ldat, copy_upper_triangle);
Expand Down
8 changes: 4 additions & 4 deletions test/drivers/test_cqrrp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -142,11 +142,11 @@ class TestCQRRP : public ::testing::Test

// Note: if a "Subprocess killed" exception occurs, reload VS Code.
TEST_F(TestCQRRP, CQRRP_blocked_full_rank_basic_070824_cpu) {
int64_t m = 300;//5000;
int64_t n = 300;//2000;
int64_t k = 300;
int64_t m = 5000;//5000;
int64_t n = 2800;//2000;
int64_t k = 2800;
double d_factor = 1;//1.0;
int64_t b_sz = 50;//500;
int64_t b_sz = 900;//500;
double norm_A = 0;
double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
auto state = RandBLAS::RNGState();
Expand Down
4 changes: 2 additions & 2 deletions test/drivers/test_cqrrp_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -353,11 +353,11 @@ TEST_F(TestCQRRP, CQRRP_GPU_070824) {
double norm_A = 0;
double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
auto state = RandBLAS::RNGState();
bool profile_runtime = false;
bool profile_runtime = true;

CQRRPTestData<double> all_data(m, n, k, d);
RandLAPACK::CQRRP_blocked_GPU<double, r123::Philox4x32> CQRRP_blocked_GPU(profile_runtime, tol, b_sz);
//CQRRP_blocked_GPU.use_qrf = true;
CQRRP_blocked_GPU.use_qrf = false;

RandLAPACK::gen::mat_gen_info<double> m_info(m, n, RandLAPACK::gen::gaussian);
//RandLAPACK::gen::mat_gen_info<double> m_info(m, n, RandLAPACK::gen::polynomial);
Expand Down

0 comments on commit e61bcf3

Please sign in to comment.