Skip to content

Commit

Permalink
Switched copy_mat_gpu to a standard one
Browse files Browse the repository at this point in the history
  • Loading branch information
TeachRaccooon committed Aug 21, 2024
1 parent e5d07e3 commit 2d3f0d9
Showing 1 changed file with 4 additions and 3 deletions.
7 changes: 4 additions & 3 deletions RandLAPACK/drivers/rl_cqrrp_gpu.hh
Original file line number Diff line number Diff line change
Expand Up @@ -390,10 +390,11 @@ int CQRRP_blocked_GPU<T, RNG>::call(
// Need to permute trailing columns of the full R-factor.
// Remember that the R-factor is stored in the upper-triangular portion of A.
// Pivoting the trailing R and the ``current'' A.
RandLAPACK::cuda_kernels::copy_mat_gpu(strm, m, cols, &A[lda * curr_sz], lda, A_copy_col_swap, lda, false);

//RandLAPACK::cuda_kernels::copy_mat_gpu(strm, m, cols, &A[lda * curr_sz], lda, A_copy_col_swap, lda, false);
blas::device_copy_matrix( m, cols, &A[lda * curr_sz], lda, A_copy_col_swap, lda, lapack_queue);

if(this -> timing) {
cudaStreamSynchronize(strm);
lapack_queue.sync();
nvtxRangePop();
copy_A_t_stop = high_resolution_clock::now();
copy_A_t_dur += duration_cast<microseconds>(copy_A_t_stop - copy_A_t_start).count();
Expand Down

0 comments on commit 2d3f0d9

Please sign in to comment.