Skip to content

Commit

Permalink
Fixed J copy bug & reverted the col_swap change
Browse files Browse the repository at this point in the history
  • Loading branch information
TeachRaccooon committed Aug 21, 2024
1 parent 815530d commit f1e2299
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 11 deletions.
2 changes: 1 addition & 1 deletion RandLAPACK/drivers/rl_cqrrp_gpu.hh
Original file line number Diff line number Diff line change
Expand Up @@ -610,7 +610,7 @@ int CQRRP_blocked_GPU<T, RNG>::call(
// Swap the roles of the live J buffer and its copy before patching columns.
J_copy_col_swap = J;
J = J_cpy_buf;
}
// NOTE(review): diff residue — both the pre-fix (`< iter`) and post-fix
// (`<= iter`) loop headers are present here; only the `<= iter` version
// should survive. The commit's fix extends the copy to include the block
// at odd_idx == iter, which the `<` bound skipped.
for(int odd_idx = 1; odd_idx < iter; odd_idx += 2) {
for(int odd_idx = 1; odd_idx <= iter; odd_idx += 2) {
// Copy one b_sz_const-long segment of pivot indices for each odd block
// index back into J on the GPU stream `strm`.
RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz_const, &J_copy_col_swap[odd_idx * b_sz_const], 1, &J[odd_idx * b_sz_const], 1);
}
// Block until the queued device copies have completed.
lapack_queue.sync();
Expand Down
18 changes: 10 additions & 8 deletions RandLAPACK/gpu_functions/rl_cuda_kernels.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -318,20 +318,21 @@ __global__ void col_swap_gpu_kernel(
int64_t ldac,
int64_t const* idx
) {

// NOTE(review): dead grid-stride variant kept for reference. It has a real
// bug — the inner loop's condition and increment use `i` instead of `j`
// (`j = ...; i < m; i += ...`), so `j` is never advanced and the row loop is
// wrong. Do not resurrect this as-is.
/*
for (int64_t i = threadIdx.x + blockDim.x * blockIdx.x; i < k; i += blockDim.x * gridDim.x) {
for (int64_t j = threadIdx.y + blockDim.y * blockIdx.y; i < m; i += blockDim.y * gridDim.y) {
int64_t l = idx[i] - 1;
A[i * lda + j] = A_cpy[l * ldac + j];
}
}
*/

// Commented-out duplicate of the active implementation below (diff residue).
// int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
// int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
// if (colIdx < k && rowIdx < m) {
// int64_t j = idx[colIdx] - 1;
// A[colIdx * lda + rowIdx] = A_cpy[j * ldac + rowIdx];
// }
// Active implementation: one thread per (column, row) element of the first
// k columns. x-dimension of the launch indexes columns, y-dimension rows.
int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
// Bounds guard: the grid is sized by ceil-division, so trailing threads
// past k columns / m rows must do nothing.
if (colIdx < k && rowIdx < m) {
// idx values are 1-based (the -1 converts to a 0-based column index;
// presumably LAPACK-style pivot indices — confirm against the caller).
int64_t j = idx[colIdx] - 1;
// Column colIdx of A receives column idx[colIdx] of the pristine copy
// A_cpy; reading only from A_cpy makes the permutation race-free.
A[colIdx * lda + rowIdx] = A_cpy[j * ldac + rowIdx];
}
}

template <typename T>
Expand Down Expand Up @@ -469,7 +470,8 @@ void copy_mat_gpu(
bool copy_upper_triangle
) {
#ifdef USE_CUDA
// NOTE(review): diff residue — the line below is the pre-change declaration
// and duplicates the commented-out line that follows; only one
// threadsPerBlock declaration should survive in the real file.
dim3 threadsPerBlock(32, 8);
//dim3 threadsPerBlock(32, 8);
// NOTE(review): 11x11 = 121 threads per block is not a multiple of the
// 32-thread warp size, so the last warp of every block runs partially
// masked — confirm this shape was chosen deliberately rather than as a
// debugging leftover.
dim3 threadsPerBlock(11, 11);
// Ceil-divide so the grid covers all n columns (x) and all m rows (y).
dim3 numBlocks((n + threadsPerBlock.x - 1) / threadsPerBlock.x,
(m + threadsPerBlock.y - 1) / threadsPerBlock.y);
// Launch the element-wise copy kernel on the caller-supplied stream.
copy_mat_gpu<<<numBlocks, threadsPerBlock, 0, stream>>>(m, n, A, lda, A_cpy, ldat, copy_upper_triangle);
Expand Down
2 changes: 0 additions & 2 deletions test/drivers/test_cqrrp_gpu.cu
Original file line number Diff line number Diff line change
Expand Up @@ -217,7 +217,6 @@ class TestCQRRP : public ::testing::TestWithParam<int64_t>
auto n = all_data.col;

CQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device);
/*
all_data.rank = CQRRP_GPU.rank;
printf("RANK AS RETURNED BY CQRRP GPU %4ld\n", all_data.rank);

Expand All @@ -234,7 +233,6 @@ class TestCQRRP : public ::testing::TestWithParam<int64_t>
RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J);

error_check(norm_A, all_data);
*/
}

template <typename T, typename RNG, typename alg_gpu, typename alg_cpu>
Expand Down

0 comments on commit f1e2299

Please sign in to comment.