Onto something
TeachRaccooon committed Aug 20, 2024
1 parent 0dbd295 commit 59ed0c0
Showing 2 changed files with 43 additions and 17 deletions.
52 changes: 39 additions & 13 deletions RandLAPACK/drivers/rl_cqrrp_gpu.hh
@@ -292,8 +292,8 @@ int CQRRP_blocked_GPU<T, RNG>::call(
 T* A_sk_copy_col_swap;
 cudaMallocAsync(&A_sk_copy_col_swap, sizeof(T) * d * n, strm);
 int64_t* J_copy_col_swap;
-int64_t* J_copy_col_swap_work = J_copy_col_swap;
 cudaMallocAsync(&J_copy_col_swap, sizeof(int64_t) * n, strm);
+int64_t* J_copy_col_swap_work = J_copy_col_swap;
 T* A_sk_buf;
 int64_t* J_cpy_buf;
 //*******************POINTERS TO DATA REQUIRING ADDITIONAL STORAGE END*******************
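The hunk above fixes a pointer-aliasing bug: J_copy_col_swap_work was previously assigned before cudaMallocAsync had written a device address into J_copy_col_swap, so the alias captured an indeterminate value. A minimal standalone sketch of the corrected ordering (illustrative only, not RandLAPACK code):

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>

    int main() {
        cudaStream_t strm;
        cudaStreamCreate(&strm);
        const int64_t n = 16;

        int64_t* J_copy_col_swap = nullptr;
        // cudaMallocAsync writes the device address into J_copy_col_swap when
        // it returns; aliasing before this call would copy a stale value.
        cudaMallocAsync(&J_copy_col_swap, sizeof(int64_t) * n, strm);
        // Correct: take the work alias only after the pointer is valid.
        int64_t* J_copy_col_swap_work = J_copy_col_swap;

        printf("buffer %p, alias %p\n", (void*) J_copy_col_swap,
                                        (void*) J_copy_col_swap_work);

        cudaFreeAsync(J_copy_col_swap, strm);
        cudaStreamSynchronize(strm);
        cudaStreamDestroy(strm);
        return 0;
    }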
@@ -307,6 +307,9 @@ int CQRRP_blocked_GPU<T, RNG>::call(

 T* buf_cpu = ( T * ) calloc( d * n, sizeof( T ) );
 T* buf_cpu_1 = ( T * ) calloc( m * n, sizeof( T ) );
+
+int64_t* buf_vec_cpu = ( int64_t * ) calloc( n, sizeof( int64_t ) );
+int64_t* buf_vec_cpu_1 = ( int64_t * ) calloc( n, sizeof( int64_t ) );
 for(iter = 0; iter < maxiter; ++iter) {
 nvtxRangePushA("Iteration");
 // Make sure we fit into the available space
@@ -344,10 +347,10 @@ int CQRRP_blocked_GPU<T, RNG>::call(
 copy_A_sk_t_start = high_resolution_clock::now();
 }
 // Apply pivots to A_sk
-RandLAPACK::cuda_kernels::copy_mat_gpu(strm, sampling_dimension, cols, A_sk_work, d, A_sk_copy_col_swap, d, false);
-//A_sk_buf = A_sk_copy_col_swap;
-//A_sk_copy_col_swap = A_sk_work;
-//A_sk_work = A_sk_buf;
+//RandLAPACK::cuda_kernels::copy_mat_gpu(strm, sampling_dimension, cols, A_sk_work, d, A_sk_copy_col_swap, d, false);
+A_sk_buf = A_sk_copy_col_swap;
+A_sk_copy_col_swap = A_sk_work;
+A_sk_work = A_sk_buf;
 if(this -> timing) {
 cudaStreamSynchronize(strm);
 nvtxRangePop();
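The replacement above swaps buffer pointers instead of copying the sampling_dimension-by-cols sketch from A_sk_work into A_sk_copy_col_swap, turning an O(d * n) device-to-device copy into an O(1) exchange. A self-contained sketch of the idiom (hypothetical helper, not part of RandLAPACK):

    #include <cstdio>
    #include <utility>

    // Rotate the roles of the workspace and the swap-source buffer; after the
    // call, `work` aliases the spare buffer and `copy_col_swap` aliases the
    // data that was just produced in the workspace.
    template <typename T>
    void rotate_buffers(T*& work, T*& copy_col_swap) {
        T* buf = copy_col_swap;
        copy_col_swap = work;
        work = buf;
        // equivalently: std::swap(work, copy_col_swap);
    }

    int main() {
        double a[4] = {1, 2, 3, 4};
        double b[4] = {};
        double* work = a;
        double* copy_col_swap = b;
        rotate_buffers(work, copy_col_swap);
        printf("work aliases b: %d, copy_col_swap aliases a: %d\n",
               work == b, copy_col_swap == a);  // prints 1, 1
        return 0;
    }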
@@ -425,7 +428,6 @@ int CQRRP_blocked_GPU<T, RNG>::call(
 }
 
 if(this -> use_qrf) {
-printf("AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA\n");
 if(this -> timing) {
 nvtxRangePushA("cholqr");
 cholqr_t_start = high_resolution_clock::now();
@@ -529,10 +531,10 @@ int CQRRP_blocked_GPU<T, RNG>::call(
 nvtxRangePushA("copy_J");
 copy_J_t_start = high_resolution_clock::now();
 }
-RandLAPACK::cuda_kernels::copy_gpu(strm, cols, J_work, 1, J_copy_col_swap, 1);
-//J_cpy_buf = J_copy_col_swap_work;
-//J_copy_col_swap_work = J_work;
-//J_work = J_cpy_buf;
+//RandLAPACK::cuda_kernels::copy_gpu(strm, cols, J_work, 1, J_copy_col_swap_work, 1);
+J_cpy_buf = J_copy_col_swap_work;
+J_copy_col_swap_work = J_work;
+J_work = J_cpy_buf;
 if(this -> timing) {
 cudaStreamSynchronize(strm);
 nvtxRangePop();
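The pivot vector gets the same treatment: the cols-element copy of J_work into the swap buffer is commented out in favor of rotating the J_work / J_copy_col_swap_work pointers (note the retired copy call is also retargeted at the work alias). The three assignments amount to a plain swap; a hypothetical one-line equivalent:

    #include <cstdint>
    #include <utility>

    inline void rotate_pivot_buffers(int64_t*& J_work, int64_t*& J_copy_col_swap_work) {
        std::swap(J_work, J_copy_col_swap_work);  // same effect as the J_cpy_buf shuffle
    }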
@@ -541,7 +543,7 @@ int CQRRP_blocked_GPU<T, RNG>::call(
 nvtxRangePushA("update_J");
 updating_J_t_start = high_resolution_clock::now();
 }
-RandLAPACK::cuda_kernels::col_swap_gpu<T>(strm, cols, cols, J_work, J_copy_col_swap, J_buffer);
+RandLAPACK::cuda_kernels::col_swap_gpu<T>(strm, cols, cols, J_work, J_copy_col_swap_work, J_buffer);
 if(this -> timing) {
 cudaStreamSynchronize(strm);
 nvtxRangePop();
@@ -553,7 +555,22 @@ int CQRRP_blocked_GPU<T, RNG>::call(
 }
 // Advance the work pointer of the global pivot vector;
 J_work = &J_work[b_sz];
-//J_copy_col_swap_work = &J_copy_col_swap_work[b_sz];
+J_copy_col_swap_work = &J_copy_col_swap_work[b_sz];
+
+cudaStreamSynchronize(strm);
+cudaMemcpy(buf_vec_cpu, J, n * sizeof(int64_t), cudaMemcpyDeviceToHost);
+cudaMemcpy(buf_vec_cpu_1, J_copy_col_swap, n * sizeof(int64_t), cudaMemcpyDeviceToHost);
+
+printf("\nPrinting J_work\n");
+for(int idx = 0; idx < n; ++idx) {
+printf("%ld\n", buf_vec_cpu[idx]);
+}
+
+printf("\nPrinting J_cpy\n");
+for(int idx = 0; idx < n; ++idx) {
+printf("%ld\n", buf_vec_cpu_1[idx]);
+}
+
 
 // Alternatively, instead of trmm + copy, we could perform a single gemm.
 // Compute R11 = R11_full(1:b_sz, :) * R_sk
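The block added at the end of this hunk stages J and J_copy_col_swap into the calloc'd host buffers and prints them after every iteration. The same steps, synchronize, copy device-to-host, print, can be folded into a small helper; a sketch (print_device_vec is not an existing RandLAPACK utility):

    #include <cuda_runtime.h>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    // Synchronize the stream, stage a device vector into host memory, and
    // print it -- the same steps as the inline debug code above.
    static void print_device_vec(cudaStream_t strm, const int64_t* d_vec,
                                 int64_t n, const char* label) {
        std::vector<int64_t> h_vec(n);
        cudaStreamSynchronize(strm);
        cudaMemcpy(h_vec.data(), d_vec, n * sizeof(int64_t), cudaMemcpyDeviceToHost);
        printf("\nPrinting %s\n", label);
        for (int64_t idx = 0; idx < n; ++idx)
            printf("%lld\n", (long long) h_vec[idx]);
    }

Usage at the same spot would be print_device_vec(strm, J, n, "J_work") and print_device_vec(strm, J_copy_col_swap, n, "J_cpy"), with no separate staging buffers to free afterwards.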
@@ -584,8 +601,17 @@ int CQRRP_blocked_GPU<T, RNG>::call(
 // Termination criteria reached
 this -> rank = curr_sz;
 
+//RandLAPACK::cuda_kernels::copy_gpu(strm, n, J_work, 1, J_copy_col_swap, 1);
+RandLAPACK::cuda_kernels::copy_gpu(strm, n - b_sz, &J_copy_col_swap[b_sz], 1, &J[b_sz], 1);
 lapack_queue.sync();
-
+
+printf("\nPrinting Final J\n");
+cudaMemcpy(buf_vec_cpu_1, J, n * sizeof(int64_t), cudaMemcpyDeviceToHost);
+for(int idx = 0; idx < n; ++idx) {
+printf("%ld\n", buf_vec_cpu_1[idx]);
+}
+
+
 
 if(this -> timing) {
 total_t_stop = high_resolution_clock::now();
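Taken together, the changes above turn A_sk_copy_col_swap and J_copy_col_swap from copy targets into true double buffers that trade places with their work counterparts every iteration. One consequence is that, after the loop, each underlying allocation holds part of the up-to-date pivot vector, which is what the final copy of n - b_sz entries back into J appears to reconcile. A host-only toy model of the rotation and lockstep advance (simplified, no GPU, two iterations):

    #include <cstdint>
    #include <cstdio>
    #include <utility>

    int main() {
        const int64_t b_sz = 3;
        int64_t J[6]      = {0, 1, 2, 3, 4, 5};  // stands in for the device J
        int64_t J_copy[6] = {0};                 // stands in for J_copy_col_swap
        int64_t* J_work = J;
        int64_t* J_copy_work = J_copy;

        for (int iter = 0; iter < 2; ++iter) {
            std::swap(J_work, J_copy_work);  // swap instead of copying
            // ... pivots for this block would be applied through J_work here ...
            J_work += b_sz;                  // advance BOTH aliases in lockstep,
            J_copy_work += b_sz;             // as the driver now does
        }
        // After an even number of swaps the aliases sit over their original
        // allocations again, but the data written through them alternated
        // between J and J_copy, hence the final gather into J.
        printf("J_work base is J again: %d\n", J_work - 2 * b_sz == J);
        return 0;
    }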
8 changes: 4 additions & 4 deletions test/drivers/test_cqrrp_gpu.cu
@@ -344,11 +344,11 @@ class TestCQRRP : public ::testing::TestWithParam<int64_t>

 // Note: If Subprocess killed exception -> reload vscode
 TEST_F(TestCQRRP, CQRRP_GPU_070824) {
-int64_t m = 5000;//5000;
-int64_t n = 2800;//2000;
-int64_t k = 2800;
+int64_t m = 9;//5000;
+int64_t n = 9;//2000;
+int64_t k = 9;
 double d_factor = 1;//1.0;
-int64_t b_sz = 900;//500;
+int64_t b_sz = 3;//500;
 int64_t d = d_factor * b_sz;
 double norm_A = 0;
 double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
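With the debug sizes above, the test's sketching dimension works out to d = d_factor * b_sz = 1 * 3 = 3, so the d-by-n sketch handled by the driver's swapped buffers is just 3 x 9, small enough to verify the printed pivot vectors by hand.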
