From 2d3f0d96ebf38e16f0b95b8c1b4a8309beeb2cf3 Mon Sep 17 00:00:00 2001
From: TeachRaccooon
Date: Wed, 21 Aug 2024 08:13:43 -0700
Subject: [PATCH] Switched copy_mat_gpu to a standard one

---
 RandLAPACK/drivers/rl_cqrrp_gpu.hh | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/RandLAPACK/drivers/rl_cqrrp_gpu.hh b/RandLAPACK/drivers/rl_cqrrp_gpu.hh
index d555d470..eaa45218 100644
--- a/RandLAPACK/drivers/rl_cqrrp_gpu.hh
+++ b/RandLAPACK/drivers/rl_cqrrp_gpu.hh
@@ -390,10 +390,11 @@ int CQRRP_blocked_GPU::call(
         // Need to premute trailing columns of the full R-factor.
         // Remember that the R-factor is stored the upper-triangular portion of A.
         // Pivoting the trailing R and the ``current'' A.
-        RandLAPACK::cuda_kernels::copy_mat_gpu(strm, m, cols, &A[lda * curr_sz], lda, A_copy_col_swap, lda, false);
-
+        //RandLAPACK::cuda_kernels::copy_mat_gpu(strm, m, cols, &A[lda * curr_sz], lda, A_copy_col_swap, lda, false);
+        blas::device_copy_matrix( m, cols, &A[lda * curr_sz], lda, A_copy_col_swap, lda, lapack_queue);
+
         if(this -> timing) {
-            cudaStreamSynchronize(strm);
+            lapack_queue.sync();
             nvtxRangePop();
             copy_A_t_stop = high_resolution_clock::now();
             copy_A_t_dur += duration_cast(copy_A_t_stop - copy_A_t_start).count();