diff --git a/.github/workflows/core-linux.yaml b/.github/workflows/core-linux.yaml index c399d48f..13fe44e9 100644 --- a/.github/workflows/core-linux.yaml +++ b/.github/workflows/core-linux.yaml @@ -1,7 +1,12 @@ name: core-linux on: pull_request: + branches-ignore: + - cqrrp-gpu-benchmarking workflow_dispatch: + push: + branches-ignore: + - cqrrp-gpu-benchmarking jobs: build: diff --git a/.github/workflows/core-macos.yaml b/.github/workflows/core-macos.yaml index ecbe540f..76a8c59b 100644 --- a/.github/workflows/core-macos.yaml +++ b/.github/workflows/core-macos.yaml @@ -1,7 +1,12 @@ name: core-macos on: pull_request: + branches-ignore: + - cqrrp-gpu-benchmarking workflow_dispatch: + push: + branches-ignore: + - cqrrp-gpu-benchmarking jobs: build: diff --git a/.gitignore b/.gitignore index 1cdca533..eb5e6dd7 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,5 @@ benchmark/build/** +**/private_config.sh # vim *.sw* diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index bfdf3d18..ab3065d2 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -56,16 +56,13 @@ class CQRRP_blocked : public CQRRPalg { T ep, int64_t b_sz ) { - timing = time_subroutines; - eps = ep; - timing = time_subroutines; - eps = ep; - block_size = b_sz; - use_qp3 = false; - use_gemqrt = false; - internal_nb = b_sz; - tol = std::numeric_limits::epsilon(); + timing = time_subroutines; + eps = ep; + timing = time_subroutines; + eps = ep; + block_size = b_sz; use_qp3 = false; + use_qrf = false; use_gemqrt = false; internal_nb = b_sz; tol = std::numeric_limits::epsilon(); @@ -134,6 +131,9 @@ class CQRRP_blocked : public CQRRPalg { // QRCP option bool use_qp3; + // Option to use GEQRF on a panel + bool use_qrf; + // Option for updating A bool use_gemqrt; @@ -168,8 +168,8 @@ int CQRRP_blocked::call( high_resolution_clock::time_point skop_t_start; high_resolution_clock::time_point qrcp_t_start; high_resolution_clock::time_point qrcp_t_stop; - 
high_resolution_clock::time_point cholqr_t_start; - high_resolution_clock::time_point cholqr_t_stop; + high_resolution_clock::time_point panelqr_t_start; + high_resolution_clock::time_point panelqr_t_stop; high_resolution_clock::time_point reconstruction_t_start; high_resolution_clock::time_point reconstruction_t_stop; high_resolution_clock::time_point preconditioning_t_start; @@ -187,7 +187,7 @@ int CQRRP_blocked::call( long preallocation_t_dur = 0; long skop_t_dur = 0; long qrcp_t_dur = 0; - long cholqr_t_dur = 0; + long panelqr_t_dur = 0; long reconstruction_t_dur = 0; long preconditioning_t_dur = 0; long r_piv_t_dur = 0; @@ -347,22 +347,14 @@ int CQRRP_blocked::call( if(this -> timing) { qrcp_t_stop = high_resolution_clock::now(); qrcp_t_dur += duration_cast(qrcp_t_stop - qrcp_t_start).count(); - r_piv_t_start = high_resolution_clock::now(); + preconditioning_t_start = high_resolution_clock::now(); } // Need to premute trailing columns of the full R-factor. // Remember that the R-factor is stored the upper-triangular portion of A. - if(iter != 0) - util::col_swap(curr_sz, cols, cols, &A[lda * curr_sz], m, J_buf); - - if(this -> timing) { - r_piv_t_stop = high_resolution_clock::now(); - r_piv_t_dur += duration_cast(r_piv_t_stop - r_piv_t_start).count(); - preconditioning_t_start = high_resolution_clock::now(); - } - - // Pivoting the current matrix A. - util::col_swap(rows, cols, cols, A_work, lda, J_buf); + // Pivoting the trailing R and the ``current'' A. + // The copy of A operation is done on a separete stream. If it was not, it would have been done here. 
+ util::col_swap(m, cols, cols, &A[lda * curr_sz], lda, J_buf); // Checking for the zero matrix post-pivoting is the best idea, // as we would only need to check one column (pivoting moves the column with the largest norm upfront) @@ -424,47 +416,55 @@ int CQRRP_blocked::call( } if(this -> timing) - cholqr_t_start = high_resolution_clock::now(); + panelqr_t_start = high_resolution_clock::now(); + + if (use_qrf) { + // Performing QRF on a panel - this skips ORHR_COL and tau extraction + tau_sub = &tau[curr_sz]; + lapack::geqrf(rows, block_rank, A_work, lda, tau_sub); + // Need to copy R into a separate buffer because there is no trtrmm in LAPACK. + lapack::lacpy(MatrixType::Upper, block_rank, block_rank, A_work, lda, R_cholqr, b_sz_const); + } else { + // Performing Cholesky QR on a panel + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, block_rank, rows, (T) 1.0, A_work, lda, (T) 0.0, R_cholqr, b_sz_const); + lapack::potrf(Uplo::Upper, block_rank, R_cholqr, b_sz_const); + // Compute Q_econ from Cholesky QR + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, block_rank, (T) 1.0, R_cholqr, b_sz_const, A_work, lda); - // Performing Cholesky QR - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, block_rank, rows, (T) 1.0, A_work, lda, (T) 0.0, R_cholqr, b_sz_const); - lapack::potrf(Uplo::Upper, block_rank, R_cholqr, b_sz_const); - // Compute Q_econ from Cholesky QR - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, block_rank, (T) 1.0, R_cholqr, b_sz_const, A_work, lda); + if(this -> timing) { + panelqr_t_stop = high_resolution_clock::now(); + panelqr_t_dur += duration_cast(panelqr_t_stop - panelqr_t_start).count(); + reconstruction_t_start = high_resolution_clock::now(); + } - if(this -> timing) { - cholqr_t_stop = high_resolution_clock::now(); - cholqr_t_dur += duration_cast(cholqr_t_stop - cholqr_t_start).count(); - reconstruction_t_start = high_resolution_clock::now(); - } + // 
Find Q (stored in A) using Householder reconstruction. + // This will represent the full (rows by rows) Q factor form Cholesky QR + // It would have been really nice to store T right above Q, but without using extra space, + // it would result in us loosing the first lower-triangular b_sz by b_sz portion of implicitly-stored Q. + // Filling T without ever touching its lower-triangular space would be a nice optimization for orhr_col routine. + // Q is defined with block_rank elementary reflectors. + // NOTE: + /// This routine is defined in LAPACK 3.9.0. + lapack::orhr_col(rows, block_rank, internal_nb, A_work, lda, T_dat, b_sz_const, Work2); - // Find Q (stored in A) using Householder reconstruction. - // This will represent the full (rows by rows) Q factor form Cholesky QR - // It would have been really nice to store T right above Q, but without using extra space, - // it would result in us loosing the first lower-triangular b_sz by b_sz portion of implicitly-stored Q. - // Filling T without ever touching its lower-triangular space would be a nice optimization for orhr_col routine. - // Q is defined with block_rank elementary reflectors. - // NOTE: - /// This routine is defined in LAPACK 3.9.0. - lapack::orhr_col(rows, block_rank, internal_nb, A_work, lda, T_dat, b_sz_const, Work2); - - // Need to change signs in the R-factor from Cholesky QR. - // Signs correspond to matrix D from orhr_col(). - // This allows us to not explicitoly compute R11_full = (Q[:, 1:block_rank])' * A_pre. - for(i = 0; i < block_rank; ++i) - for(j = 0; j < (i + 1); ++j) - R_cholqr[(b_sz_const * i) + j] *= Work2[j]; - - // Define a pointer to the current subportion of tau vector. - tau_sub = &tau[curr_sz]; - // Entries of tau will be placed on the main diagonal of the block matrix T from orhr_col(). 
- for(i = 0; i < block_rank; ++i) - tau_sub[i] = T_dat[(b_sz_const * i) + (i % internal_nb)]; + if(this -> timing) { + reconstruction_t_stop = high_resolution_clock::now(); + reconstruction_t_dur += duration_cast(reconstruction_t_stop - reconstruction_t_start).count(); + updating1_t_start = high_resolution_clock::now(); + } - if(this -> timing) { - reconstruction_t_stop = high_resolution_clock::now(); - reconstruction_t_dur += duration_cast(reconstruction_t_stop - reconstruction_t_start).count(); - updating1_t_start = high_resolution_clock::now(); + // Need to change signs in the R-factor from Cholesky QR. + // Signs correspond to matrix D from orhr_col(). + // This allows us to not explicitoly compute R11_full = (Q[:, 1:block_rank])' * A_pre. + for(i = 0; i < block_rank; ++i) + for(j = 0; j < (i + 1); ++j) + R_cholqr[(b_sz_const * i) + j] *= Work2[j]; + + // Define a pointer to the current subportion of tau vector. + tau_sub = &tau[curr_sz]; + // Entries of tau will be placed on the main diagonal of the block matrix T from orhr_col(). + for(i = 0; i < block_rank; ++i) + tau_sub[i] = T_dat[(b_sz_const * i) + (i % internal_nb)]; } // Perform Q_full' * A_piv(:, b_sz:end) to find R12 and the new "current A." @@ -474,7 +474,8 @@ int CQRRP_blocked::call( // With that, everything is placed where it should be, no copies required. // Q is defined with block_rank elementary reflectors. // GEMQRT is a faster alternative to ORMQR, takes in the matrix T instead of vector tau. - if(use_gemqrt) { + // Using QRF prevents us from using gemqrt unless matrix T was explicitly constructed. 
+ if(use_gemqrt && !use_qrf) { lapack::gemqrt(Side::Left, Op::Trans, rows, cols - b_sz, block_rank, internal_nb, A_work, lda, T_dat, b_sz_const, Work1, lda); } else { lapack::ormqr(Side::Left, Op::Trans, rows, cols - b_sz, block_rank, A_work, lda, tau_sub, Work1, lda); @@ -532,14 +533,14 @@ int CQRRP_blocked::call( total_t_dur = duration_cast(total_t_stop - total_t_start).count(); long t_rest = total_t_dur - (preallocation_t_dur + skop_t_dur + qrcp_t_dur + reconstruction_t_dur + preconditioning_t_dur + updating1_t_dur + updating2_t_dur + updating3_t_dur + r_piv_t_dur); this -> times.resize(12); - this -> times = {skop_t_dur, preallocation_t_dur, qrcp_t_dur, preconditioning_t_dur, cholqr_t_dur, reconstruction_t_dur, updating1_t_dur, updating2_t_dur, updating3_t_dur, r_piv_t_dur, t_rest, total_t_dur}; + this -> times = {skop_t_dur, preallocation_t_dur, qrcp_t_dur, preconditioning_t_dur, panelqr_t_dur, reconstruction_t_dur, updating1_t_dur, updating2_t_dur, updating3_t_dur, r_piv_t_dur, t_rest, total_t_dur}; printf("\n\n/------------CQRRP TIMING RESULTS BEGIN------------/\n"); printf("Preallocation time: %25ld μs,\n", preallocation_t_dur); printf("skop time: %34ld μs,\n", skop_t_dur); printf("QRCP time: %36ld μs,\n", qrcp_t_dur); printf("Preconditioning time: %24ld μs,\n", preconditioning_t_dur); - printf("CholQR time: %32ld μs,\n", cholqr_t_dur); + printf("CholQR time: %32ld μs,\n", panelqr_t_dur); printf("Householder vector restoration time: %7ld μs,\n", reconstruction_t_dur); printf("Computing A_new, R12 time: %23ld μs,\n", updating1_t_dur); printf("Factors updating time: %23ld μs,\n", updating3_t_dur); @@ -552,7 +553,7 @@ int CQRRP_blocked::call( printf("skop generation and application takes %2.2f%% of runtime.\n", 100 * ((T) skop_t_dur / (T) total_t_dur)); printf("QRCP takes %32.2f%% of runtime.\n", 100 * ((T) qrcp_t_dur / (T) total_t_dur)); printf("Preconditioning takes %20.2f%% of runtime.\n", 100 * ((T) preconditioning_t_dur / (T) total_t_dur)); - 
printf("Cholqr takes %29.2f%% of runtime.\n", 100 * ((T) cholqr_t_dur / (T) total_t_dur)); + printf("Cholqr takes %29.2f%% of runtime.\n", 100 * ((T) panelqr_t_dur / (T) total_t_dur)); printf("Householder restoration takes %12.2f%% of runtime.\n", 100 * ((T) reconstruction_t_dur / (T) total_t_dur)); printf("Computing A_new, R12 takes %14.2f%% of runtime.\n", 100 * ((T) updating1_t_dur / (T) total_t_dur)); printf("Factors updating time takes %14.2f%% of runtime.\n", 100 * ((T) updating3_t_dur / (T) total_t_dur)); diff --git a/RandLAPACK/drivers/rl_cqrrp_gpu.hh b/RandLAPACK/drivers/rl_cqrrp_gpu.hh index e4d12aef..d89a2ff3 100644 --- a/RandLAPACK/drivers/rl_cqrrp_gpu.hh +++ b/RandLAPACK/drivers/rl_cqrrp_gpu.hh @@ -220,7 +220,6 @@ int CQRRP_blocked_GPU::call( /******************************STREAM/QUEUE/HANDLE*********************************/ lapack::Queue lapack_queue(0); cudaStream_t strm = lapack_queue.stream(); - lapack::Queue copy_queue{0}; using lapack::device_info_int; device_info_int* d_info = blas::device_malloc< device_info_int >( 1, lapack_queue ); int *d_info_cusolver = nullptr; @@ -303,14 +302,16 @@ int CQRRP_blocked_GPU::call( // This strategy would still require using buffers of size of the original data. T* A_copy_col_swap; cudaMallocAsync(&A_copy_col_swap, sizeof(T) * m * n, strm); + T* A_copy_col_swap_work = A_copy_col_swap; + T* A_sk_copy_col_swap; cudaMallocAsync(&A_sk_copy_col_swap, sizeof(T) * d * n, strm); T* A_sk_copy_col_swap_work = A_sk_copy_col_swap; + int64_t* J_copy_col_swap; cudaMallocAsync(&J_copy_col_swap, sizeof(int64_t) * n, strm); int64_t* J_copy_col_swap_work = J_copy_col_swap; - // Pointer buffer required for our special data movement-avoiding strategy. 
- int64_t* J_cpy_buf; + //*******************POINTERS TO DATA REQUIRING ADDITIONAL STORAGE END******************* cudaStreamSynchronize(strm); if(this -> timing) { @@ -322,9 +323,6 @@ int CQRRP_blocked_GPU::call( for(iter = 0; iter < maxiter; ++iter) { nvtxRangePushA("Iteration"); - // start async copy -- look for copy_queue.sync() for completion - blas::device_copy_matrix( m, cols, &A[lda * curr_sz], lda, A_copy_col_swap, lda, copy_queue); - // Make sure we fit into the available space b_sz = std::min(this->block_size, std::min(m, n) - curr_sz); block_rank = b_sz; @@ -405,7 +403,6 @@ int CQRRP_blocked_GPU::call( // Pivoting the trailing R and the ``current'' A. // The copy of A operation is done on a separete stream. If it was not, it would have been done here. - copy_queue.sync(); if(this -> timing) { nvtxRangePop(); copy_A_t_stop = high_resolution_clock::now(); @@ -414,7 +411,27 @@ int CQRRP_blocked_GPU::call( piv_A_t_start = high_resolution_clock::now(); } - RandLAPACK::cuda_kernels::col_swap_gpu(strm, m, cols, cols, &A[lda * curr_sz], lda, A_copy_col_swap, lda, J_buffer); + // Instead of copying A into A_copy_col_swap, we ``swap'' the pointers. + // We have to take some precautions when ICQRRP main loop terminates. + // Since we want A to be accessible and valid outside of ICQRRP, we need to make sure that + // its entries were, in fact, computed correctly. + // + // The original memory space that the matrix A points to would only contain the correct entry ranges, computed at ODD + // iterations of ICQRRP's main loop. + // The correct entries from the even iterations would be contained in the memory space that was originally pointed to + // by A_copy_col_swap. + // Hence, when ICQRRP terminates, we would need to copy the results from the even iterations from A_copy_col_swap to A. 
+ // + // Remember that since the pointers A and A_copy_col_swap are swapped at every even iteration of the main ICQRRP loop, + // if the ICQRRP terminates with iter being even, we would need to swap these pointers back around. + // Recall also that if A and A_cpy needed to be swapped at termination and iter != maxiter - 1, A_cpy would contain the "correct" + // entries in column range ((iter + 1) * b_sz : end), so we need to not forget to copy those over into A. + // + // Additional thing to remember is that the final copy needs to be performed in terms of b_sz_const, not b_sz. + std::swap(A_copy_col_swap, A); + A_work = &A[lda * curr_sz + curr_sz]; + A_copy_col_swap_work = &A_copy_col_swap[lda * curr_sz + curr_sz]; + RandLAPACK::cuda_kernels::col_swap_gpu(strm, m, cols, cols, &A[lda * curr_sz], lda, &A_copy_col_swap[lda * curr_sz], lda, J_buffer); // Checking for the zero matrix post-pivoting is the best idea, // as we would only need to check one column (pivoting moves the column with the largest norm upfront) @@ -432,19 +449,33 @@ int CQRRP_blocked_GPU::call( } this -> rank = curr_sz; - // Measures taken to insure J holds correct data, explained above. + // Measures taken to ensure J holds correct data, explained above. if(iter % 2) { - // Total number of iterations is even (iter starts at 0) - J_cpy_buf = J_copy_col_swap; - J_copy_col_swap = J; - J = J_cpy_buf; + // Total number of iterations is even (iter is odd, and iter starts at 0) + std::swap(J_copy_col_swap, J); + } else { + // Total number of iterations is odd (iter is even) + std::swap(A_copy_col_swap, A); + if(iter != (maxiter - 1)){ + // Copy trailing portion of A_cpy into A + blas::device_copy_matrix(m, n - (iter + 1) * b_sz_const, &A_copy_col_swap[lda * (iter + 1) * b_sz_const], lda, &A[lda * (iter + 1) * b_sz_const], lda, lapack_queue); + } } - for(int odd_idx = 1; odd_idx <= iter; odd_idx += 2) { - if(odd_idx == iter) { - // Do not copy extra data if b_sz has changed. 
- RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz, &J_copy_col_swap[odd_idx * b_sz_const], 1, &J[odd_idx * b_sz_const], 1); - } else { - RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz_const, &J_copy_col_swap[odd_idx * b_sz_const], 1, &J[odd_idx * b_sz_const], 1); + for (int idx = 0; idx <= iter; ++idx) { + if (idx % 2) { // Odd index - copy portions of J + if (idx == iter) { + // Avoid copying extra entries if b_sz has changed + RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz, &J_copy_col_swap[idx * b_sz_const], 1, &J[idx * b_sz_const], 1); + } else { + RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz_const, &J_copy_col_swap[idx * b_sz_const], 1, &J[idx * b_sz_const], 1); + } + } else { // Even index - copy portions of A + if (idx == iter) { + // Avoid copying extra entries if b_sz has changed + blas::device_copy_matrix(m, b_sz, &A_copy_col_swap[lda * idx * b_sz_const], lda, &A[lda * idx * b_sz_const], lda, lapack_queue); + } else { + blas::device_copy_matrix(m, b_sz_const, &A_copy_col_swap[lda * idx * b_sz_const], lda, &A[lda * idx * b_sz_const], lda, lapack_queue); + } } } lapack_queue.sync(); @@ -507,9 +538,9 @@ int CQRRP_blocked_GPU::call( std::vector h_work_geqrf_vector_opt( h_size_geqrf_opt ); h_work_geqrf_opt = h_work_geqrf_vector_opt.data(); } - lapack::geqrf(rows, b_sz, A_work, lda, &tau[curr_sz], d_work_geqrf_opt, d_size_geqrf_opt, h_work_geqrf_opt, h_size_geqrf_opt, d_info, lapack_queue); + lapack::geqrf(rows, block_rank, A_work, lda, &tau[curr_sz], d_work_geqrf_opt, d_size_geqrf_opt, h_work_geqrf_opt, h_size_geqrf_opt, d_info, lapack_queue); //R_cholqr = A_work; - RandLAPACK::cuda_kernels::copy_mat_gpu(strm, b_sz, b_sz, A_work, lda, R_cholqr, b_sz_const, true); + RandLAPACK::cuda_kernels::copy_mat_gpu(strm, block_rank, block_rank, A_work, lda, R_cholqr, b_sz_const, true); if(this -> timing) { cudaStreamSynchronize(strm); @@ -612,16 +643,15 @@ int CQRRP_blocked_GPU::call( // its entries were, in fact, computed correctly. 
// // The original memory space that the vector J points to would only contain the correct pivot ranges, computed at EVEN - // iterations of ICQRRP's main loop. + // iterations of ICQRRP's main loop (by contrast to the situation with matrix A, since the pointers J and J_cpy do not get swapped at iteration 0). // The correct entries from the odd iterations would be contained in the memory space that was originbally pointed to // by J_copy_col_swap. // Hence, when ICQRRP terminates, we would need to copy the results from the odd iterations form J_copy_col_swap to J. // - // Remember that since the pointers J and J_copy_col_swap are swapped at every even iteration of the main ICQRRP loop, - // if the ICQRRP terminates with iter being even, we would need to swap these pointers back around. + // Remember that since the pointers J and J_copy_col_swap are swapped at every odd iteration of the main ICQRRP loop, + // if the ICQRRP terminates with iter being odd, we would need to swap these pointers back around. // // Additional thing to remember is that the final copy needs to be performed in terms of b_sz_const, not b_sz. - // No need to worry about the altered b_sz when performing a copy, because it is always placed where it should be in J. std::swap(J_copy_col_swap, J); J_work = &J[curr_sz]; J_copy_col_swap_work = &J_copy_col_swap[curr_sz]; @@ -680,21 +710,38 @@ int CQRRP_blocked_GPU::call( this -> rank = curr_sz; // Measures taken to insure J holds correct data, explained above. 
if(iter % 2) { - // Total number of iterations is even (iter starts at 0) - J_cpy_buf = J_copy_col_swap; - J_copy_col_swap = J; - J = J_cpy_buf; + // Total number of iterations is even (iter is odd, and iter starts at 0) + std::swap(J_copy_col_swap, J); + } else { + // Total number of iterations is odd (iter is even) + std::swap(A_copy_col_swap, A); + // In addition to the copy from A_cpy to A space below, we also need to account for the cases when early termination has occurred (iter != maxiter - 1), and pointers A and A_cpy need to switch places, + // Aka when A_cpy has the "correct" trailing entries. + // This means that all entries from (iter + 1) * b_sz_const to end need to be copied over from A_cpy to A. + // It is most likely the case that these trailing entries are all 0, but in order to be extra safe, we shall perform a full copy. + if(iter != (maxiter - 1)){ + blas::device_copy_matrix(m, n - (iter + 1) * b_sz_const, &A_copy_col_swap[lda * (iter + 1) * b_sz_const], lda, &A[lda * (iter + 1) * b_sz_const], lda, lapack_queue); + } } - for(int odd_idx = 1; odd_idx <= iter; odd_idx += 2) { - if(odd_idx == iter) { - // Do not copy extra data if b_sz has changed. 
- RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz, &J_copy_col_swap[odd_idx * b_sz_const], 1, &J[odd_idx * b_sz_const], 1); - } else { - RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz_const, &J_copy_col_swap[odd_idx * b_sz_const], 1, &J[odd_idx * b_sz_const], 1); + for (int idx = 0; idx <= iter; ++idx) { + if (idx % 2) { // Odd index - copy portions of J + if (idx == iter) { + // Avoid copying extra entries if b_sz has changed + RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz, &J_copy_col_swap[idx * b_sz_const], 1, &J[idx * b_sz_const], 1); + } else { + RandLAPACK::cuda_kernels::copy_gpu(strm, b_sz_const, &J_copy_col_swap[idx * b_sz_const], 1, &J[idx * b_sz_const], 1); + } + } else { // Even index - copy portions of A + if (idx == iter) { + // Avoid copying extra entries if b_sz has changed + blas::device_copy_matrix(m, b_sz, &A_copy_col_swap[lda * idx * b_sz_const], lda, &A[lda * idx * b_sz_const], lda, lapack_queue); + } else { + blas::device_copy_matrix(m, b_sz_const, &A_copy_col_swap[lda * idx * b_sz_const], lda, &A[lda * idx * b_sz_const], lda, lapack_queue); + } } } lapack_queue.sync(); - + if(this -> timing) { total_t_stop = high_resolution_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index 360a8143..cd8bf861 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -205,7 +205,13 @@ int CQRRPT::call( Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, (T) 1.0, S, 0, 0, A, lda, (T) 0.0, A_hat, d ); - +/* + T* S = ( T * ) calloc( d * m, sizeof( T ) ); + RandBLAS::DenseDist D(d, m); + state = RandBLAS::fill_dense(D, S, state).second; + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, A, m, 0.0, A_hat, d); + free(S); +*/ if(this -> timing) { saso_t_stop = high_resolution_clock::now(); qrcp_t_start = high_resolution_clock::now(); diff --git a/RandLAPACK/drivers/rl_hqrrp.hh 
b/RandLAPACK/drivers/rl_hqrrp.hh index b84a34f2..fcb4a40e 100644 --- a/RandLAPACK/drivers/rl_hqrrp.hh +++ b/RandLAPACK/drivers/rl_hqrrp.hh @@ -1191,7 +1191,7 @@ int64_t hqrrp( printf("Updating A takes %14.2f%% of runtime.\n", 100 * ((T) updating_A_t_dur / (T) total_t_dur)); printf("Updating Sketch takes %14.2f%% of runtime.\n", 100 * ((T) updating_Sketch_t_dur / (T) total_t_dur)); printf("Everything else takes %20.2f%% of runtime.\n", 100 * ((T) other_t_dur / (T) total_t_dur)); - printf("/-------------CQRRP TIMING RESULTS END-------------/\n\n"); + printf("/-------------HQRRP TIMING RESULTS END-------------/\n\n"); } // Remove auxiliary objects. diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 1068ffad..83f592ae 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -241,13 +241,14 @@ void gen_spiked_mat( T spike_scale, RandBLAS::RNGState &state ) { + int64_t num_rows_sampled = n / 2; /// sample from [m] without replacement. Get the row indices for a tall LASO with a single column. 
RandBLAS::SparseDist DS = {.n_rows = m, .n_cols = 1, .vec_nnz = num_rows_sampled, .major_axis = RandBLAS::MajorAxis::Long}; RandBLAS::SparseSkOp S(DS, state); state = RandBLAS::fill_sparse(S); - + T* V = ( T * ) calloc( n * n, sizeof( T ) ); T* tau = ( T * ) calloc( n, sizeof( T ) ); @@ -258,13 +259,12 @@ void gen_spiked_mat( lapack::ungqr(n, n, n, V, n, tau); // Fill A with stacked copies of V - int start = 0; + int64_t size = 0; int i, j; - while(start + n <= m){ - for( j = 0; j < n; ++j) { - blas::copy(n, &V[m * j], 1, &A[start + (m * j)], 1); - } - start += n; + + while(size < m){ + lapack::lacpy(MatrixType::General, std::min(n, m - size), n, V, n, &A[size], m); + size += std::min(n, m - size); } for (i = 0; i < n; ++ i) { diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 8be3b062..f74c5f32 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -82,13 +82,15 @@ set( ) # Performance profiling through GEMM -add_benchmark(NAME GEMM_flop_count CXX_SOURCES bench_general/GEMM_flop_count.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME GEMM_flop_count CXX_SOURCES bench_general/GEMM_flop_count.cc LINK_LIBS ${Benchmark_libs}) # Lapack functionality benchmark -add_benchmark(NAME Chol_check CXX_SOURCES bench_general/Chol_check.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME Chol_check CXX_SOURCES bench_general/Chol_check.cc LINK_LIBS ${Benchmark_libs}) # Data conversion helper script -add_benchmark(NAME convert_time CXX_SOURCES bench_general/convert_time.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME convert_time CXX_SOURCES bench_general/convert_time.cc LINK_LIBS ${Benchmark_libs}) # Compare GEMM and ORMQR performance -add_benchmark(NAME Gemm_vs_ormqr CXX_SOURCES bench_general/Gemm_vs_ormqr.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME Gemm_vs_ormqr CXX_SOURCES bench_general/Gemm_vs_ormqr.cc LINK_LIBS ${Benchmark_libs}) +# BLAS levels performance +add_benchmark(NAME basic_blas_speed CXX_SOURCES 
bench_general/basic_blas_speed.cc LINK_LIBS ${Benchmark_libs}) # CQRRPT benchmarks add_benchmark(NAME CQRRPT_speed_comparisons CXX_SOURCES bench_CQRRPT/CQRRPT_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) @@ -96,13 +98,14 @@ add_benchmark(NAME CQRRPT_runtime_breakdown CXX_SOURCES bench_CQRRPT/CQRRPT_runt add_benchmark(NAME CQRRPT_pivot_quality CXX_SOURCES bench_CQRRPT/CQRRPT_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) # CQRRP benchmarks -add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_single_precision CXX_SOURCES bench_CQRRP/CQRRP_single_precision.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME HQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/HQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME QR_speed_comp CXX_SOURCES bench_CQRRP/QR_speed_comp.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME ICQRRP_subroutines_speed CXX_SOURCES bench_CQRRP/ICQRRP_subroutines_speed.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME HQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/HQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME QR_speed_comp CXX_SOURCES bench_CQRRP/QR_speed_comp.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME ICQRRP_subroutines_speed CXX_SOURCES bench_CQRRP/ICQRRP_subroutines_speed.cc LINK_LIBS 
${Benchmark_libs}) +add_benchmark(NAME CQRRP_decisions_speed_benchmark CXX_SOURCES bench_CQRRP/CQRRP_decisions_speed_benchmark.cc LINK_LIBS ${Benchmark_libs}) + # RBKI benchmarks add_benchmark(NAME RBKI_speed_comparisons CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) diff --git a/benchmark/bench_CQRRP/CQRRP_decisions_speed_benchmark.cc b/benchmark/bench_CQRRP/CQRRP_decisions_speed_benchmark.cc new file mode 100644 index 00000000..30b20e80 --- /dev/null +++ b/benchmark/bench_CQRRP/CQRRP_decisions_speed_benchmark.cc @@ -0,0 +1,150 @@ +#if defined(__APPLE__) +int main() {return 0;} +#else +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_gen.hh" + +#include +#include + +template +struct QR_speed_benchmark_data { + int64_t row; + int64_t col; + T tolerance; + T sampling_factor; + std::vector A; + std::vector tau; + std::vector J; + + QR_speed_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : + A(m * n, 0.0), + tau(n, 0.0), + J(n, 0) + { + row = m; + col = n; + tolerance = tol; + sampling_factor = d_factor; + } +}; + +// Re-generate and clear data +template +static void data_regen(RandLAPACK::gen::mat_gen_info m_info, + QR_speed_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); + std::fill(all_data.J.begin(), all_data.J.end(), 0); +} + +template +static void call_all_algs( + RandLAPACK::gen::mat_gen_info m_info, + int64_t numruns, + int64_t b_sz, + QR_speed_benchmark_data &all_data, + RandBLAS::RNGState &state, + std::string output_filename) { + + auto m = all_data.row; + auto n = all_data.col; + auto tol = all_data.tolerance; + auto d_factor = all_data.sampling_factor; + + // Additional params setup. + RandLAPACK::CQRRP_blocked CQRRP_blocked(false, tol, b_sz); + // We are not using panel pivoting in performance testing. 
+ int panel_pivoting = 0; + + // timing vars + long dur_cqrrp_cholqr = 0; + long dur_cqrrp_qrf = 0; + + // Making sure the states are unchanged + auto state_gen = state; + auto state_alg = state; + + for (int i = 0; i < numruns; ++i) { + printf("ITERATION %d, NUMCOLS %ld\n", i, n); + + // Testing CQRRP - GEQRF on a panel (use_qrf), GEMQRT disabled + CQRRP_blocked.use_qrf = true; + CQRRP_blocked.use_gemqrt = false; + auto start_cqrrp_qrf = high_resolution_clock::now(); + CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state_alg); + auto stop_cqrrp_qrf = high_resolution_clock::now(); + dur_cqrrp_qrf = duration_cast(stop_cqrrp_qrf - start_cqrrp_qrf).count(); + printf("TOTAL TIME FOR CQRRP_QRF %ld\n", dur_cqrrp_qrf); + + // Making sure the states are unchanged + state_gen = state; + state_alg = state; + // Clear and re-generate data + data_regen(m_info, all_data, state_gen); + + // Testing CQRRP - Cholesky QR on a panel (use_qrf disabled), GEMQRT enabled + CQRRP_blocked.use_qrf = false; + CQRRP_blocked.use_gemqrt = true; + auto start_cqrrp_cholqr = high_resolution_clock::now(); + CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state_alg); + auto stop_cqrrp_cholqr = high_resolution_clock::now(); + dur_cqrrp_cholqr = duration_cast(stop_cqrrp_cholqr - start_cqrrp_cholqr).count(); + printf("TOTAL TIME FOR CQRRP_CHOLQR %ld\n", dur_cqrrp_cholqr); + + // Making sure the states are unchanged + state_gen = state; + state_alg = state; + // Clear and re-generate data + data_regen(m_info, all_data, state_gen); + + std::ofstream file(output_filename, std::ios::app); + file << ", " << dur_cqrrp_cholqr << ", " << dur_cqrrp_qrf << ",\n"; + } +} + +int main(int argc, char *argv[]) { + + if(argc <= 1) { + printf("No input provided\n"); + return 0; + } + + auto size = argv[1]; + + // Declare parameters + int64_t m = std::stol(size); + int64_t n = std::stol(size); + double d_factor = 1.0; + int64_t b_sz_start = 256; + int64_t b_sz_end = 2048; + double tol = 
std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_constant = state; + // Timing results + std::vector res; + // Number of algorithm runs. We only record best times. + int64_t numruns = 5; + + // Allocate basic workspace + QR_speed_benchmark_data all_data(m, n, tol, d_factor); + // Generate the input matrix - gaussian suffices for performance tests. + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + + // Declare a data file + std::string output_filename = "ICQRRP_time_raw_rows_" + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_b_sz_start_" + std::to_string(b_sz_start) + + "_b_sz_end_" + std::to_string(b_sz_end) + + "_d_factor_" + std::to_string(d_factor) + + ".dat"; + for (;b_sz_start <= b_sz_end; b_sz_start *= 2) { + call_all_algs(m_info, numruns, b_sz_start, all_data, state_constant, output_filename); + } +} +#endif diff --git a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc index a3e2f9f2..63c1e665 100644 --- a/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc +++ b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc @@ -178,12 +178,19 @@ static void sv_ratio( data_regen(m_info, all_data, state_gen); } -int main() { +int main(int argc, char *argv[]) { + + if(argc <= 1) { + printf("No input provided\n"); + return 0; + } + auto size = argv[1]; + // Declare parameters - int64_t m = std::pow(2, 10); - int64_t n = std::pow(2, 10); - double d_factor = 1.25; - int64_t b_sz = 256; + int64_t m = std::stol(size); + int64_t n = std::stol(size); + double d_factor = 1.0; + int64_t b_sz = n; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant1 = state; @@ -195,13 +202,25 @@ int main() { // Allocate basic workspace QR_speed_benchmark_data all_data(m, n, tol, d_factor); // Generate the input matrix - gaussian suffices for performance tests. 
- RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::spiked); + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::polynomial); m_info.cond_num = std::pow(10, 10); m_info.rank = n; m_info.exponent = 2.0; m_info.scaling = std::pow(10, 10); RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + std::fstream file("A_generated_rows_" + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_b_sz_" + std::to_string(b_sz) + + "_d_factor_" + std::to_string(d_factor) + + ".dat", std::fstream::app); + for (int i = 0; i < n ; ++i){ + for (int j = 0; j < m ; ++j){ + file << all_data.A[m * i + j] << ", "; + } + file << "\n"; + } + R_norm_ratio(m_info, b_sz, all_data, state_constant1); printf("R done\n"); sv_ratio(m_info, b_sz, all_data, state_constant2); diff --git a/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc b/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc index edb1fa5b..28f7bc57 100644 --- a/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc +++ b/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc @@ -81,7 +81,7 @@ static void call_all_algs( std::vector inner_timing; for (int i = 0; i < numruns; ++i) { - printf("ITERATION\n"); + printf("ITERATION %d, NUMCOLS %ld\n", i, n); // Testing CQRRP - best setuo CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state_alg); @@ -100,11 +100,18 @@ static void call_all_algs( } } -int main() { +int main(int argc, char *argv[]) { + + if(argc <= 1) { + printf("No input provided\n"); + return 0; + } + auto size = argv[1]; + // Declare parameters - int64_t m = std::pow(2, 14); - int64_t n = std::pow(2, 14); - double d_factor = 1.25; + int64_t m = std::stol(size); + int64_t n = std::stol(size); + double d_factor = 1.0; int64_t b_sz_start = 256; int64_t b_sz_end = 2048; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); @@ -113,7 +120,7 @@ int main() { // Timing results std::vector res; // Number of algorithm runs. We only record best times. 
- int64_t numruns = 10; + int64_t numruns = 3; // Allocate basic workspace QR_speed_benchmark_data all_data(m, n, tol, d_factor); @@ -122,7 +129,7 @@ int main() { RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file - std::string file= "CQRRP_inner_speed_" + std::to_string(m) + std::string file= "CQRRP_runtime_breakdown_" + std::to_string(m) + "_cols_" + std::to_string(n) + "_b_sz_start_" + std::to_string(b_sz_start) + "_b_sz_end_" + std::to_string(b_sz_end) diff --git a/benchmark/bench_CQRRP/CQRRP_single_precision.cc b/benchmark/bench_CQRRP/CQRRP_single_precision.cc deleted file mode 100644 index 720abdc2..00000000 --- a/benchmark/bench_CQRRP/CQRRP_single_precision.cc +++ /dev/null @@ -1,170 +0,0 @@ -#if defined(__APPLE__) -int main() {return 0;} -#else -/* -This benchmarks compares single-precision ICQRRP with double-precision GETRF and GEQRF. -We anticipate that single-precision ICQRRP can be used as part of the linear system solving process. -*/ -#include "RandLAPACK.hh" -#include "rl_blaspp.hh" -#include "rl_lapackpp.hh" -#include "rl_gen.hh" - -#include -#include - -template -struct QR_speed_benchmark_data { - int64_t row; - int64_t col; - T tolerance; - T sampling_factor; - std::vector A; - std::vector tau; - std::vector J; - - QR_speed_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : - A(m * n, 0.0), - tau(n, 0.0), - J(n, 0) - { - row = m; - col = n; - tolerance = tol; - sampling_factor = d_factor; - } -}; - -// Re-generate and clear data -template -static void data_regen(RandLAPACK::gen::mat_gen_info m_info, - QR_speed_benchmark_data &all_data, - RandBLAS::RNGState &state, int apply_itoa) { - - RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); - std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); - if (apply_itoa) { - std::iota(all_data.J.begin(), all_data.J.end(), 1); - } else { - std::fill(all_data.J.begin(), all_data.J.end(), 0); - } -} - -template -static std::vector call_all_algs( - 
RandLAPACK::gen::mat_gen_info m_info_cqrrp, - RandLAPACK::gen::mat_gen_info m_info_rest, - int64_t numruns, - int64_t b_sz, - QR_speed_benchmark_data &all_data_cqrrp, - QR_speed_benchmark_data &all_data_rest, - RandBLAS::RNGState &state) { - - auto m = all_data_cqrrp.row; - auto n = all_data_cqrrp.col; - auto tol = all_data_cqrrp.tolerance; - auto d_factor = all_data_cqrrp.sampling_factor; - - // Additional params setup. - RandLAPACK::CQRRP_blocked CQRRP_blocked(false, tol, b_sz); - // We are nbot using panel pivoting in performance testing. - // timing vars - long dur_cqrrp = 0; - long dur_geqrf = 0; - long dur_getrf = 0; - long t_cqrrp_best = 0; - long t_geqrf_best = 0; - long t_getrf_best = 0; - - // Making sure the states are unchanged - auto state_gen = state; - auto state_alg = state; - - for (int i = 0; i < numruns; ++i) { - printf("ITERATION\n"); - // Testing GEQRF - auto start_getrf = high_resolution_clock::now(); - lapack::getrf(m, n, all_data_rest.A.data(), m, all_data_rest.J.data()); - auto stop_getrf = high_resolution_clock::now(); - dur_getrf = duration_cast(stop_getrf - start_getrf).count(); - printf("TOTAL TIME FOR GETRF %ld\n", dur_getrf); - // Update best timing - i == 0 ? t_getrf_best = dur_getrf : (dur_getrf < t_getrf_best) ? t_getrf_best = dur_getrf : NULL; - - data_regen(m_info_rest, all_data_rest, state_gen, 0); - state_gen = state; - - // Testing GEQRF - auto start_geqrf = high_resolution_clock::now(); - lapack::geqrf(m, n, all_data_rest.A.data(), m, all_data_rest.tau.data()); - auto stop_geqrf = high_resolution_clock::now(); - dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); - printf("TOTAL TIME FOR GEQRF %ld\n", dur_geqrf); - // Update best timing - i == 0 ? t_geqrf_best = dur_geqrf : (dur_geqrf < t_geqrf_best) ? 
t_geqrf_best = dur_geqrf : NULL; - - // Clear and re-generate data - data_regen(m_info_rest, all_data_rest, state_gen, 0); - state_gen = state; - - // Testing CQRRP - best setup - auto start_cqrrp = high_resolution_clock::now(); - CQRRP_blocked.call(m, n, all_data_cqrrp.A.data(), m, d_factor, all_data_cqrrp.tau.data(), all_data_cqrrp.J.data(), state_alg); - auto stop_cqrrp = high_resolution_clock::now(); - dur_cqrrp = duration_cast(stop_cqrrp - start_cqrrp).count(); - printf("TOTAL TIME FOR CQRRP %ld\n", dur_cqrrp); - // Update best timing - i == 0 ? t_cqrrp_best = dur_cqrrp : (dur_cqrrp < t_cqrrp_best) ? t_cqrrp_best = dur_cqrrp : NULL; - - // Clear and re-generate data - data_regen(m_info_cqrrp, all_data_cqrrp, state_gen, 1); - state_gen = state; - state_alg = state; - } - - std::vector res{t_cqrrp_best, t_geqrf_best, t_getrf_best}; - - return res; -} - -int main() { - // Declare parameters - int64_t m = std::pow(2, 14); - int64_t n = std::pow(2, 14); - double d_factor = 1.25; - int64_t b_sz_start = 256; - int64_t b_sz_end = 2048; - double tol = std::pow(std::numeric_limits::epsilon(), 0.85); - auto state = RandBLAS::RNGState(); - auto state_cpy = state; - auto state_constant = state; - // Timing results - std::vector res; - // Number of algorithm runs. We only record best times. - int64_t numruns = 5; - - // Allocate basic workspace - double - QR_speed_benchmark_data all_data_d(m, n, tol, d_factor); - // Generate the input matrix - gaussian suffices for performance tests. - RandLAPACK::gen::mat_gen_info m_info_d(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info_d, all_data_d.A.data(), state); - - // Allocate basic workspace - float - QR_speed_benchmark_data all_data_f(m, n, (float) tol, (float) d_factor); - // Generate the input matrix - gaussian suffices for performance tests. 
- RandLAPACK::gen::mat_gen_info m_info_f(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info_f, all_data_f.A.data(), state_cpy); - - // Declare a data file - std::fstream file("Apple_QR_time_raw_rows_" + std::to_string(m) - + "_cols_" + std::to_string(n) - + "_b_sz_start_" + std::to_string(b_sz_start) - + "_b_sz_end_" + std::to_string(b_sz_end) - + "_d_factor_" + std::to_string(d_factor) - + ".dat", std::fstream::app); - for (;b_sz_start <= b_sz_end; b_sz_start *= 2) { - res = call_all_algs(m_info_f, m_info_d, numruns, b_sz_start, all_data_f, all_data_d, state_constant); - file << res[0] << ", " << res[1] << ", " << res[2] << ",\n"; - } -} -#endif diff --git a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc index 5004e7c0..3c06d9bc 100644 --- a/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc +++ b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc @@ -90,7 +90,7 @@ static void call_all_algs( auto state_alg = state; for (int i = 0; i < numruns; ++i) { - printf("\nITERATION %d\n", i); + printf("ITERATION %d, NUMCOLS %ld\n", i, n); // Testing GEQRF auto start_geqrf = high_resolution_clock::now(); @@ -201,7 +201,7 @@ int main(int argc, char *argv[]) { // Declare parameters int64_t m = std::stol(size); int64_t n = std::stol(size); - double d_factor = 1.25; + double d_factor = 1.0; int64_t b_sz_start = 256; int64_t b_sz_end = 2048; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); @@ -210,7 +210,7 @@ int main(int argc, char *argv[]) { // Timing results std::vector res; // Number of algorithm runs. We only record best times. 
- int64_t numruns = 10; + int64_t numruns = 3; // Allocate basic workspace QR_speed_benchmark_data all_data(m, n, tol, d_factor); diff --git a/benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc b/benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc index 8ff0019c..8df6aba3 100644 --- a/benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc +++ b/benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc @@ -76,7 +76,7 @@ static void call_all_algs( T* times = ( T * ) calloc(29, sizeof( T ) ); for (int i = 0; i < numruns; ++i) { - printf("Iteration %d start.\n", i); + printf("ITERATION %d, NUMCOLS %ld\n", i, n); // Testing HQRRP // No CholQR @@ -101,14 +101,13 @@ int main(int argc, char *argv[]) { printf("No input provided\n"); return 0; } - auto size = argv[1]; // Declare parameters int64_t m = std::stol(size); int64_t n = std::stol(size); double d_factor = 1.0; - int64_t b_sz_start = 256; + int64_t b_sz_start = 32; int64_t b_sz_end = 2048; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); @@ -136,4 +135,4 @@ int main(int argc, char *argv[]) { call_all_algs(m_info, numruns, b_sz_start, all_data, state_constant, file); } } -#endif \ No newline at end of file +#endif diff --git a/benchmark/bench_CQRRP/ICQRRP_subroutines_speed.cc b/benchmark/bench_CQRRP/ICQRRP_subroutines_speed.cc index 7782ad7b..be8e8f17 100644 --- a/benchmark/bench_CQRRP/ICQRRP_subroutines_speed.cc +++ b/benchmark/bench_CQRRP/ICQRRP_subroutines_speed.cc @@ -332,8 +332,8 @@ int main(int argc, char *argv[]) { int64_t i = 0; // Declare parameters int64_t m = std::stol(size); - int64_t n_start = 256; - int64_t n_stop = 1024; + int64_t n_start = 32; + int64_t n_stop = 2048; int64_t nb_start = 32; auto state = RandBLAS::RNGState(); auto state_B = RandBLAS::RNGState(); diff --git a/benchmark/bench_CQRRP/QR_speed_comp.cc b/benchmark/bench_CQRRP/QR_speed_comp.cc index 5c577e6e..ed39d91b 100644 --- a/benchmark/bench_CQRRP/QR_speed_comp.cc +++ b/benchmark/bench_CQRRP/QR_speed_comp.cc @@ 
-85,7 +85,7 @@ static void call_all_algs( auto state_gen = state; for (int i = 0; i < numruns; ++i) { - printf("Iteration %d start.\n", i); + printf("ITERATION %d, NUMCOLS %ld\n", i, n); // Testing GEQRF auto start_geqrf = high_resolution_clock::now(); lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); @@ -145,7 +145,7 @@ static void call_all_algs( int main() { // Declare parameters int64_t m = std::pow(2, 17); - int64_t n_start = std::pow(2, 9); + int64_t n_start = std::pow(2, 8); int64_t n_stop = std::pow(2, 13); auto state = RandBLAS::RNGState(); auto state_constant = state; diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..653e090c --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,12 @@ +181366, 243, 1650345, 9768521, 1212746, 1020709, 8885592, 235955, 389479, 0, 1216066, 23348276, +156984, 443, 1641501, 8352403, 1199292, 1059717, 8616565, 247962, 398244, 0, 1203492, 21677311, +153211, 490, 1670654, 7662616, 1209118, 981318, 8356764, 245481, 404787, 0, 1222361, 20697682, +288897, 2439, 1759922, 3594576, 1945487, 1188277, 8233515, 371372, 393981, 0, 1950132, 17783111, +305643, 1826, 1763243, 3622106, 1951734, 1088138, 8161927, 376738, 409825, 0, 1953370, 17682816, +299563, 2126, 1742891, 3578718, 1946758, 1085813, 8198881, 374208, 414490, 0, 1949126, 17645816, +520984, 4487, 2989711, 2207920, 2465014, 1681940, 7589539, 493286, 419811, 0, 2465834, 18373512, +524284, 3789, 2984481, 2182578, 2475591, 1677777, 7509439, 492175, 424433, 0, 2476440, 18275396, +516907, 6079, 3040611, 2157762, 2459327, 1652034, 7625658, 485774, 423102, 0, 2460161, 18368088, +842259, 6842, 9014976, 
2084637, 3070767, 2431451, 6192576, 686528, 461902, 0, 3071242, 24792413, +753773, 4252, 9060596, 2121501, 3117551, 2445025, 5892123, 664491, 448198, 0, 3118034, 24507993, +751137, 3066, 9197242, 2243651, 3107618, 2458725, 5866643, 709468, 451175, 0, 3108098, 24789205, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..a416abe3 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,12 @@ +554620, 275, 4801215, 36561965, 2393351, 2072318, 36135898, 703920, 1365058, 0, 2405967, 84601236, +443497, 545, 4759373, 46144249, 2390063, 2033555, 36239197, 709092, 1404938, 0, 2405353, 94139799, +442742, 438, 4757143, 52880884, 2394578, 2018381, 41995878, 737673, 1431287, 0, 2408666, 106673092, +797717, 2338, 4888568, 21108732, 4213535, 2335011, 38951294, 1119949, 1430083, 0, 4222349, 74856041, +788249, 1261, 4962434, 20826139, 4021795, 2401959, 35446640, 1042935, 1380978, 0, 4030697, 70881292, +811570, 508, 4989596, 21025691, 4057179, 2384313, 35142772, 1012780, 1359650, 0, 4065535, 70792415, +1520972, 5589, 7766328, 12051112, 6347976, 2812414, 33822228, 1209494, 1379689, 0, 6351121, 66918947, +1505262, 4291, 7923582, 11417931, 6555946, 2823460, 33964946, 1228288, 1390404, 0, 6559121, 66817285, +1507168, 5728, 7955134, 11108034, 6381134, 2796127, 33600483, 1277268, 1382880, 0, 6384253, 66017075, +2821554, 229, 15855113, 9438847, 6171233, 4218395, 31526454, 2416167, 1542838, 0, 6172951, 73992548, +2831265, 144, 15904247, 11087887, 6045482, 4159464, 31548864, 2288441, 1545727, 0, 6047040, 75413079, +2858958, 187, 15667530, 11530858, 6183599, 4266160, 31836552, 2503869, 1563467, 0, 6185480, 76413061, diff 
--git a/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_65536_cols_65536_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_65536_cols_65536_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..3b5a082f --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_65536_cols_65536_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,12 @@ +2070599, 692, 15664347, 310437863, 14441311, 4808586, 291631803, 2295516, 5300342, 0, 14481032, 646690780, +1786608, 359, 15748943, 288052401, 14278324, 4817768, 284422223, 2280434, 5322967, 0, 14318388, 616750091, +1765226, 501, 15798463, 291734677, 14260280, 4757044, 284576899, 2273670, 5306752, 0, 14301141, 620514373, +3184937, 1032, 17445941, 157456876, 21431946, 5442476, 281733938, 3967288, 5140808, 0, 21457285, 495830581, +3209533, 1433, 17324466, 159092380, 21136383, 5466374, 278820824, 3810630, 5011585, 0, 21158342, 493895567, +3194814, 1579, 17379494, 162823361, 20825415, 5515106, 277643672, 3713321, 5017174, 0, 20847027, 496135548, +5978063, 6104, 27084429, 82999359, 25988182, 6485481, 270645520, 5269194, 5172538, 0, 26000811, 429641499, +5968769, 6345, 27158872, 83515567, 25647406, 6535865, 267927888, 5183162, 5046135, 0, 25659901, 427002504, +6018893, 5648, 27436905, 85076351, 25283068, 6606931, 268233812, 5161290, 5055815, 0, 25295915, 428891560, +11297560, 226, 37104856, 79536844, 14696104, 9719569, 258332365, 8839961, 5384560, 0, 14703010, 424918951, +11292363, 505, 37787799, 56152169, 14397660, 9814167, 258557037, 8604084, 5312554, 0, 14404722, 401925400, +11228293, 614, 37523218, 52449354, 14211896, 9721546, 258874073, 8793977, 5425619, 0, 14218300, 398234994, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_8192_cols_8192_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat 
b/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_8192_cols_8192_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..40cdba68 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/CQRRP_runtime_breakdown_8192_cols_8192_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,12 @@ +89990, 226, 611665, 1344960, 433099, 547049, 1633030, 78357, 113763, 0, 433979, 4853019, +46778, 7072, 618468, 1657501, 426850, 508590, 1642473, 74827, 116722, 0, 427749, 5100180, +46431, 8687, 621269, 1610307, 426854, 517256, 1658861, 69239, 118028, 0, 427765, 5077843, +101052, 788, 718103, 680499, 724208, 602902, 1449776, 133063, 120996, 0, 724699, 4531878, +97449, 1382, 699928, 700428, 717549, 595135, 1448996, 127295, 120263, 0, 718039, 4508915, +99561, 1384, 714902, 690043, 723512, 612556, 1434440, 128941, 122808, 0, 724005, 4528640, +165500, 5565, 1593647, 571756, 888192, 940076, 1294366, 158534, 138707, 0, 888450, 5756601, +164306, 2359, 1595922, 621254, 885277, 974801, 1323147, 163232, 127135, 0, 885543, 5857699, +162799, 2740, 1570846, 578224, 892729, 940003, 1281984, 158756, 130680, 0, 892987, 5719019, +215838, 59, 1815055, 746899, 1401877, 1409257, 1053900, 241782, 168386, 0, 1402027, 7053203, +217363, 50, 1762140, 735344, 1424821, 1404318, 1096215, 255700, 170058, 0, 1424971, 7066159, +297705, 142, 1774773, 733591, 1423101, 1391762, 1110542, 252336, 172414, 0, 1423236, 7156501, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..2b989f3e --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,35 @@ +16384, 32, 124, 77235, 0, 8.16952e+06, 
3.60095e+06, 3.35696e+07, 1.05527e+06, 47996, 4.65207e+07, 20251, 406576, 2.14202e+06, 3232, 1.72162e+06, 3.83282e+06, 0, 42705, 8.16922e+06, 1326, 0, 0, 330522, 1.04307e+06, 3, 2.20478e+06, 21096, 3.6008e+06, +16384, 32, 4356, 39881, 0, 7.60175e+06, 3.51221e+06, 3.29618e+07, 973960, 63782, 4.51578e+07, 21088, 404991, 2.16462e+06, 2943, 1.66706e+06, 3.29827e+06, 0, 42482, 7.60146e+06, 1455, 0, 1, 318841, 942297, 41, 2.2284e+06, 21053, 3.51208e+06, +16384, 32, 4405, 40111, 0, 7.19966e+06, 3.47538e+06, 3.28972e+07, 1.12194e+06, 64974, 4.48037e+07, 20284, 405139, 2.24006e+06, 3125, 1.68996e+06, 2.79989e+06, 0, 40913, 7.19938e+06, 1753, 0, 0, 274118, 937430, 1, 2.24092e+06, 21038, 3.47526e+06, +16384, 32, 4275, 39188, 0, 7.65085e+06, 3.58519e+06, 3.36003e+07, 1.1768e+06, 77093, 4.61337e+07, 20299, 406978, 2.18023e+06, 2966, 1.6853e+06, 3.31388e+06, 0, 40916, 7.65057e+06, 1605, 0, 0, 316322, 956455, 1, 2.28958e+06, 21092, 3.58505e+06, +16384, 32, 5074, 32822, 0, 7.94857e+06, 3.59578e+06, 3.41931e+07, 1.18153e+06, 77287, 4.70342e+07, 21294, 408885, 2.14442e+06, 2961, 1.71752e+06, 3.61248e+06, 0, 40696, 7.94826e+06, 1791, 0, 10, 335333, 999552, 0, 2.23789e+06, 21087, 3.59567e+06, +16384, 64, 3747, 54072, 0, 9.37098e+06, 4.5213e+06, 2.44545e+07, 1.24752e+06, 48179, 3.97003e+07, 9659, 321317, 1.82939e+06, 2259, 2.79824e+06, 4.36827e+06, 0, 41696, 9.37084e+06, 1056, 0, 0, 420839, 1.80809e+06, 5, 2.27057e+06, 20665, 4.52122e+06, +16384, 64, 5149, 38338, 0, 9.24158e+06, 4.53348e+06, 2.40398e+07, 1.22738e+06, 47791, 3.91335e+07, 9870, 319711, 1.89738e+06, 2294, 2.76804e+06, 4.20335e+06, 0, 40780, 9.24143e+06, 1002, 0, 0, 401646, 1.80528e+06, 0, 2.30476e+06, 20714, 4.53341e+06, +16384, 64, 5292, 36562, 0, 8.77516e+06, 4.46294e+06, 2.41861e+07, 1.21964e+06, 48110, 3.87338e+07, 9286, 314438, 1.96258e+06, 2649, 2.79119e+06, 3.65395e+06, 0, 40938, 8.77503e+06, 1115, 0, 0, 361071, 1.7753e+06, 7, 2.3048e+06, 20562, 4.46286e+06, +16384, 64, 3857, 54470, 0, 8.61326e+06, 4.58717e+06, 
2.41125e+07, 1.23043e+06, 49753, 3.86514e+07, 10839, 361674, 1.87979e+06, 2264, 2.84689e+06, 3.47053e+06, 0, 41126, 8.61311e+06, 1041, 0, 4, 349935, 1.88966e+06, 0, 2.32597e+06, 20485, 4.5871e+06, +16384, 64, 5317, 38295, 0, 9.39593e+06, 4.577e+06, 2.37452e+07, 1.19589e+06, 45153, 3.90027e+07, 9198, 312397, 1.96704e+06, 3334, 2.92893e+06, 4.13343e+06, 0, 41482, 9.39581e+06, 940, 0, 0, 379441, 1.83398e+06, 0, 2.34199e+06, 20576, 4.57693e+06, +16384, 128, 4864, 81378, 0, 1.10913e+07, 5.8063e+06, 1.88501e+07, 1.02112e+06, 26197, 3.68812e+07, 6373, 269867, 1.81477e+06, 1402, 4.60312e+06, 4.35671e+06, 0, 38916, 1.10912e+07, 585, 0, 0, 419883, 3.38163e+06, 0, 1.98373e+06, 20426, 5.80626e+06, +16384, 128, 5899, 83981, 0, 1.01333e+07, 5.5589e+06, 1.85453e+07, 934086, 23654, 3.52851e+07, 6777, 267416, 1.83147e+06, 1462, 4.4915e+06, 3.49592e+06, 0, 38665, 1.01332e+07, 596, 0, 0, 283574, 3.42031e+06, 0, 1.83403e+06, 20365, 5.55888e+06, +16384, 128, 8392, 78510, 0, 1.04732e+07, 5.74978e+06, 1.85163e+07, 955901, 27755, 3.58099e+07, 6109, 265928, 1.8844e+06, 2081, 4.54188e+06, 3.73419e+06, 0, 38562, 1.04731e+07, 557, 0, 0, 376976, 3.39459e+06, 1, 1.95717e+06, 20446, 5.74974e+06, +16384, 128, 8547, 56321, 0, 1.01319e+07, 5.68868e+06, 1.82842e+07, 1.03038e+06, 27488, 3.52275e+07, 5114, 246164, 1.88585e+06, 3019, 4.59373e+06, 3.35958e+06, 0, 38318, 1.01318e+07, 597, 0, 7, 339215, 3.34722e+06, 0, 1.98116e+06, 20434, 5.68863e+06, +16384, 128, 8821, 55800, 0, 1.06146e+07, 5.88489e+06, 1.84624e+07, 994386, 28080, 3.6049e+07, 5770, 258608, 1.82332e+06, 2525, 4.62874e+06, 3.85631e+06, 0, 39242, 1.06145e+07, 631, 0, 2, 392206, 3.4938e+06, 7, 1.97782e+06, 20380, 5.88485e+06, +16384, 256, 8253, 95276, 0, 1.58883e+07, 8.88919e+06, 1.6751e+07, 1.21509e+06, 14959, 4.28621e+07, 2243, 235574, 2.31977e+06, 5959, 9.43714e+06, 3.84876e+06, 0, 38800, 1.58882e+07, 461, 0, 1, 344090, 6.3417e+06, 4, 2.18271e+06, 20165, 8.88913e+06, +16384, 256, 8607, 150923, 0, 1.67295e+07, 8.9228e+06, 1.67394e+07, 
1.2234e+06, 19580, 4.37942e+07, 2925, 217611, 2.31765e+06, 6778, 9.51755e+06, 4.62705e+06, 0, 39906, 1.67295e+07, 384, 0, 0, 388784, 6.31278e+06, 2, 2.20052e+06, 20283, 8.92276e+06, +16384, 256, 8787, 139374, 0, 1.56205e+07, 8.65764e+06, 1.66125e+07, 1.2127e+06, 18409, 4.22698e+07, 3383, 236655, 2.37539e+06, 7351, 9.71054e+06, 3.24886e+06, 0, 38223, 1.56204e+07, 403, 0, 2, 297289, 6.17394e+06, 0, 2.16561e+06, 20339, 8.65759e+06, +16384, 256, 8169, 147008, 0, 1.61012e+07, 8.48308e+06, 1.69882e+07, 1.23889e+06, 19096, 4.29856e+07, 3972, 234686, 2.34282e+06, 5908, 9.20756e+06, 4.26787e+06, 0, 38295, 1.61011e+07, 464, 0, 1, 340665, 5.98364e+06, 0, 2.13814e+06, 20117, 8.48303e+06, +16384, 256, 9797, 140217, 0, 1.76576e+07, 8.67445e+06, 1.66709e+07, 1.23238e+06, 19113, 4.44045e+07, 3459, 219774, 2.34172e+06, 6971, 9.57255e+06, 5.47454e+06, 0, 38554, 1.76576e+07, 464, 0, 0, 347812, 6.1325e+06, 1, 2.1735e+06, 20133, 8.67441e+06, +16384, 512, 10475, 285008, 0, 4.51089e+07, 2.23973e+07, 1.66194e+07, 1.76891e+06, 23263, 8.62133e+07, 833, 208279, 2.43732e+06, 23250, 3.78829e+07, 4.51724e+06, 0, 39046, 4.51089e+07, 316, 0, 2, 366227, 1.96046e+07, 0, 2.40402e+06, 22118, 2.23973e+07, +16384, 512, 8919, 284301, 0, 4.5389e+07, 2.23414e+07, 1.64637e+07, 1.78812e+06, 23444, 8.6299e+07, 984, 223855, 2.4634e+06, 23196, 3.83944e+07, 4.24406e+06, 0, 39170, 4.5389e+07, 317, 0, 4, 364392, 1.9543e+07, 0, 2.41168e+06, 22048, 2.23414e+07, +16384, 512, 11173, 291525, 0, 4.43303e+07, 2.23562e+07, 1.64949e+07, 1.77437e+06, 23114, 8.52815e+07, 589, 207092, 2.36165e+06, 24111, 3.74009e+07, 4.29676e+06, 0, 39095, 4.43302e+07, 320, 0, 3, 372887, 1.96266e+07, 0, 2.33436e+06, 22050, 2.23562e+07, +16384, 512, 10768, 284217, 0, 4.39348e+07, 2.21069e+07, 1.63104e+07, 1.76463e+06, 21809, 8.44334e+07, 581, 210387, 2.40648e+06, 23051, 3.6994e+07, 4.26123e+06, 0, 39050, 4.39347e+07, 327, 0, 0, 358389, 1.93274e+07, 4, 2.39874e+06, 22048, 2.21069e+07, +16384, 512, 10740, 279337, 0, 4.30934e+07, 2.22506e+07, 
1.63844e+07, 1.76813e+06, 22309, 8.38089e+07, 602, 211015, 2.4281e+06, 23189, 3.64817e+07, 3.91046e+06, 0, 38297, 4.30933e+07, 345, 0, 1, 354395, 1.94548e+07, 1, 2.4191e+06, 21968, 2.22506e+07, +16384, 1024, 1035, 518599, 0, 8.17013e+07, 5.40298e+07, 1.69859e+07, 2.89019e+06, 40170, 1.56167e+08, 216, 203133, 2.41064e+06, 37561, 7.50822e+07, 3.92874e+06, 0, 38800, 8.17013e+07, 183, 0, 3, 377071, 5.04952e+07, 0, 3.13302e+06, 24275, 5.40298e+07, +16384, 1024, 1111, 516494, 0, 7.95755e+07, 5.55996e+07, 1.71203e+07, 2.86289e+06, 44593, 1.55721e+08, 237, 201327, 2.4531e+06, 36385, 7.30687e+07, 3.77742e+06, 0, 38352, 7.95755e+07, 179, 0, 3, 373592, 5.20421e+07, 8, 3.15934e+06, 24332, 5.55996e+07, +16384, 1024, 1291, 526029, 0, 7.92133e+07, 5.42631e+07, 1.71009e+07, 2.90057e+06, 40885, 1.54046e+08, 248, 203888, 2.35575e+06, 36964, 7.27733e+07, 3.80484e+06, 0, 38265, 7.92133e+07, 174, 0, 0, 384720, 5.07063e+07, 0, 3.14771e+06, 24233, 5.42631e+07, +16384, 1024, 1318, 518284, 0, 8.18863e+07, 5.26864e+07, 1.68283e+07, 2.85885e+06, 42974, 1.54823e+08, 271, 207188, 2.32453e+06, 38893, 7.50508e+07, 4.22601e+06, 0, 38643, 8.18863e+07, 179, 0, 5, 410959, 4.9028e+07, 0, 3.22301e+06, 24234, 5.26864e+07, +16384, 1024, 1541, 397354, 0, 7.94591e+07, 5.50904e+07, 1.67145e+07, 2.8735e+06, 42450, 1.54579e+08, 208, 204158, 2.354e+06, 38469, 7.26951e+07, 4.12859e+06, 0, 38592, 7.94591e+07, 205, 0, 3, 400618, 5.15159e+07, 1, 3.14939e+06, 24271, 5.50904e+07, +16384, 2048, 8366, 964792, 0, 1.3349e+08, 1.10944e+08, 1.69939e+07, 4.14909e+06, 33893, 2.66584e+08, 121, 201618, 2.29764e+06, 51694, 1.27178e+08, 3.72562e+06, 0, 35754, 1.3349e+08, 95, 0, 2, 397795, 1.04819e+08, 6, 5.70166e+06, 25732, 1.10944e+08, +16384, 2048, 4438, 747090, 0, 1.31715e+08, 1.08729e+08, 1.69656e+07, 4.28107e+06, 30544, 2.62473e+08, 167, 200017, 2.33017e+06, 50566, 1.25343e+08, 3.75603e+06, 0, 35567, 1.31715e+08, 103, 0, 2, 377137, 1.02709e+08, 0, 5.61734e+06, 25741, 1.08729e+08, +16384, 2048, 4470, 741845, 0, 
1.32472e+08, 1.09419e+08, 1.71381e+07, 4.16796e+06, 31972, 2.63975e+08, 172, 201088, 2.29133e+06, 51763, 1.26324e+08, 3.56758e+06, 0, 35490, 1.32472e+08, 103, 0, 5, 386195, 1.03381e+08, 3, 5.62654e+06, 25577, 1.09419e+08, +16384, 2048, 3400, 965903, 1, 1.327e+08, 1.12812e+08, 1.70294e+07, 4.19479e+06, 32751, 2.67739e+08, 171, 198986, 2.27399e+06, 52605, 1.26108e+08, 4.0309e+06, 0, 35788, 1.327e+08, 100, 0, 4, 384704, 1.06738e+08, 3, 5.66421e+06, 25619, 1.12812e+08, +16384, 2048, 4497, 965993, 0, 1.31249e+08, 1.07818e+08, 1.71362e+07, 4.09722e+06, 34492, 2.61305e+08, 144, 199319, 2.33545e+06, 49958, 1.25041e+08, 3.58758e+06, 0, 35952, 1.31249e+08, 107, 0, 5, 379982, 1.01684e+08, 6, 5.7273e+06, 26135, 1.07818e+08, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..413434ef --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,35 @@ +32768, 32, 57, 245951, 0, 4.00264e+07, 8.80568e+06, 1.40776e+08, 3.05297e+06, 149459, 1.93056e+08, 68979, 1.43721e+06, 1.35868e+07, 28115, 7.4647e+06, 1.7361e+07, 0, 78991, 4.00258e+07, 1580, 0, 11, 1.37264e+06, 4.22555e+06, 6, 3.16273e+06, 42764, 8.80528e+06, +32768, 32, 1025, 206424, 6, 3.97001e+07, 8.43109e+06, 1.41018e+08, 3.10727e+06, 143611, 1.92607e+08, 66332, 1.40681e+06, 1.31762e+07, 28916, 7.41898e+06, 1.75235e+07, 0, 78872, 3.96996e+07, 1852, 0, 4, 1.33482e+06, 3.86919e+06, 8, 3.18234e+06, 42566, 8.43078e+06, +32768, 32, 7510, 206186, 0, 3.93175e+07, 8.37082e+06, 1.40667e+08, 3.14096e+06, 140817, 1.9185e+08, 65521, 1.39837e+06, 1.39686e+07, 28765, 7.31063e+06, 1.64666e+07, 0, 78521, 3.9317e+07, 1627, 0, 0, 1.29384e+06, 3.85631e+06, 4, 3.1762e+06, 42530, 
8.37051e+06, +32768, 32, 6100, 202916, 0, 4.07299e+07, 8.78847e+06, 1.41401e+08, 3.08056e+06, 139633, 1.94349e+08, 65821, 1.41139e+06, 1.33134e+07, 29149, 7.70355e+06, 1.81253e+07, 0, 80771, 4.07294e+07, 1729, 0, 7, 1.448e+06, 3.97843e+06, 0, 3.31743e+06, 42561, 8.78815e+06, +32768, 32, 5817, 206080, 0, 3.97391e+07, 8.52627e+06, 1.41778e+08, 3.11182e+06, 143000, 1.9351e+08, 66074, 1.43318e+06, 1.32883e+07, 28873, 7.39979e+06, 1.74433e+07, 0, 79064, 3.97386e+07, 1720, 0, 4, 1.35575e+06, 3.8949e+06, 7, 3.23083e+06, 42750, 8.52596e+06, +32768, 64, 7141, 226580, 0, 4.3886e+07, 1.39464e+07, 9.51786e+07, 3.19958e+06, 124681, 1.56569e+08, 24759, 1.08383e+06, 1.31358e+07, 26936, 1.15397e+07, 1.79955e+07, 0, 79126, 4.38857e+07, 1346, 0, 18, 1.87567e+06, 8.28309e+06, 2, 3.74416e+06, 41945, 1.39462e+07, +32768, 64, 7936, 227331, 0, 4.4386e+07, 1.38585e+07, 9.81575e+07, 3.06784e+06, 141341, 1.59846e+08, 31034, 1.15792e+06, 1.32844e+07, 26756, 1.15463e+07, 1.82605e+07, 0, 78868, 4.43857e+07, 1166, 0, 4, 1.83783e+06, 8.22011e+06, 0, 3.75711e+06, 42085, 1.38583e+07, +32768, 64, 7228, 226671, 0, 4.35306e+07, 1.39113e+07, 9.68405e+07, 3.02228e+06, 121216, 1.5766e+08, 27965, 1.08383e+06, 1.31506e+07, 27576, 1.15341e+07, 1.7629e+07, 0, 77289, 4.35304e+07, 1339, 0, 0, 1.77189e+06, 8.20434e+06, 0, 3.89147e+06, 42090, 1.39111e+07, +32768, 64, 8447, 227262, 0, 4.41445e+07, 1.39649e+07, 9.6701e+07, 3.0876e+06, 121213, 1.58255e+08, 28082, 1.10249e+06, 1.31199e+07, 27019, 1.12807e+07, 1.85076e+07, 0, 78508, 4.41442e+07, 1361, 0, 5, 1.86766e+06, 8.29366e+06, 0, 3.75985e+06, 42193, 1.39647e+07, +32768, 64, 9670, 225916, 0, 4.33874e+07, 1.42249e+07, 9.70151e+07, 3.17793e+06, 120265, 1.58161e+08, 24776, 1.0895e+06, 1.31559e+07, 26735, 1.13672e+07, 1.76443e+07, 0, 78643, 4.33871e+07, 1370, 0, 17, 2.01869e+06, 8.27227e+06, 0, 3.8901e+06, 42347, 1.42248e+07, +32768, 128, 8449, 273336, 0, 6.1479e+07, 1.84434e+07, 6.78546e+07, 3.02913e+06, 90961, 1.51179e+08, 12671, 910299, 1.56125e+07, 36256, 
2.39588e+07, 2.08736e+07, 0, 74615, 6.14788e+07, 909, 0, 1, 1.42514e+06, 1.37159e+07, 2, 3.2596e+06, 41656, 1.84432e+07, +32768, 128, 10820, 313772, 0, 6.02347e+07, 1.84361e+07, 7.22632e+07, 2.90964e+06, 91849, 1.5426e+08, 18838, 1.00248e+06, 1.55234e+07, 33298, 2.41168e+07, 1.94663e+07, 0, 73353, 6.02345e+07, 946, 0, 4, 1.34816e+06, 1.37158e+07, 3, 3.32958e+06, 41478, 1.84359e+07, +32768, 128, 8643, 274967, 0, 6.31198e+07, 1.8343e+07, 6.95343e+07, 2.91301e+06, 85651, 1.54279e+08, 14806, 919103, 1.56987e+07, 32584, 2.40085e+07, 2.23714e+07, 0, 74538, 6.31196e+07, 968, 0, 0, 1.47879e+06, 1.35138e+07, 5, 3.30753e+06, 41654, 1.83428e+07, +32768, 128, 9548, 272095, 0, 6.26174e+07, 1.84052e+07, 7.0442e+07, 2.95971e+06, 86713, 1.54793e+08, 19639, 1.0106e+06, 1.55644e+07, 34668, 2.40412e+07, 2.18733e+07, 0, 73512, 6.26173e+07, 916, 0, 3, 1.4277e+06, 1.35993e+07, 3, 3.33566e+06, 41433, 1.8405e+07, +32768, 128, 9702, 272756, 0, 6.33836e+07, 1.86754e+07, 6.89796e+07, 3.0289e+06, 85709, 1.54436e+08, 14623, 916213, 1.56983e+07, 36529, 2.4276e+07, 2.23672e+07, 0, 74491, 6.33834e+07, 953, 0, 2, 1.49301e+06, 1.37984e+07, 7, 3.34146e+06, 41458, 1.86753e+07, +32768, 256, 8770, 429507, 0, 1.34706e+08, 5.16987e+07, 6.28746e+07, 3.80978e+06, 66414, 2.53594e+08, 9044, 864255, 1.58306e+07, 88652, 9.75196e+07, 2.03114e+07, 0, 82716, 1.34706e+08, 616, 0, 4, 1.76861e+06, 4.62475e+07, 0, 3.63874e+06, 43093, 5.16985e+07, +32768, 256, 8294, 421394, 0, 1.31483e+08, 5.05856e+07, 6.20623e+07, 3.83337e+06, 66120, 2.4846e+08, 7645, 823218, 1.63115e+07, 87580, 9.45768e+07, 1.95937e+07, 0, 82152, 1.31483e+08, 641, 0, 0, 1.64897e+06, 4.52648e+07, 0, 3.62766e+06, 43531, 5.05856e+07, +32768, 256, 8782, 498082, 0, 1.37546e+08, 5.31575e+07, 6.23871e+07, 3.84334e+06, 69076, 2.5751e+08, 8509, 840052, 1.598e+07, 97025, 9.93867e+07, 2.11511e+07, 0, 82421, 1.37546e+08, 613, 0, 0, 1.85741e+06, 4.75282e+07, 2, 3.72825e+06, 42938, 5.31574e+07, +32768, 256, 8916, 530892, 0, 1.35013e+08, 5.263e+07, 6.26238e+07, 
3.84285e+06, 71787, 2.54722e+08, 8362, 819963, 1.58619e+07, 91407, 9.65824e+07, 2.1567e+07, 0, 82225, 1.35013e+08, 713, 0, 2, 1.90518e+06, 4.69219e+07, 0, 3.75834e+06, 43730, 5.26299e+07, +32768, 256, 9277, 411569, 0, 1.332e+08, 5.23337e+07, 6.24979e+07, 3.84175e+06, 65874, 2.5236e+08, 7454, 822291, 1.57734e+07, 93445, 9.65487e+07, 1.98728e+07, 0, 81896, 1.332e+08, 637, 0, 0, 1.77665e+06, 4.68257e+07, 3, 3.68735e+06, 43230, 5.23336e+07, +32768, 512, 595, 1.05978e+06, 0, 2.33262e+08, 1.16255e+08, 6.01011e+07, 5.67097e+06, 100190, 4.16449e+08, 3276, 795560, 1.53629e+07, 146115, 1.96279e+08, 2.05929e+07, 0, 82462, 2.33262e+08, 390, 0, 5, 1.63889e+06, 1.10203e+08, 0, 4.36725e+06, 45108, 1.16255e+08, +32768, 512, 769, 802700, 0, 2.39304e+08, 1.19281e+08, 5.99413e+07, 5.66716e+06, 103086, 4.25101e+08, 2640, 781574, 1.55347e+07, 156109, 2.01278e+08, 2.14689e+07, 0, 82006, 2.39304e+08, 576, 0, 0, 1.72525e+06, 1.13018e+08, 0, 4.49337e+06, 44541, 1.19281e+08, +32768, 512, 574, 791136, 0, 2.21438e+08, 1.07906e+08, 6.22801e+07, 5.88063e+06, 98035, 3.98395e+08, 2030, 787715, 1.50698e+07, 129662, 1.86196e+08, 1.91698e+07, 0, 83238, 2.21438e+08, 527, 0, 6, 1.49514e+06, 1.01898e+08, 2, 4.4671e+06, 45407, 1.07906e+08, +32768, 512, 836, 809300, 0, 2.34223e+08, 1.13506e+08, 6.08115e+07, 5.75047e+06, 99905, 4.15202e+08, 2547, 781804, 1.55147e+07, 151938, 1.9696e+08, 2.07305e+07, 0, 81705, 2.34223e+08, 590, 0, 1, 1.54281e+06, 1.07596e+08, 4, 4.32265e+06, 44174, 1.13506e+08, +32768, 512, 814, 812221, 0, 2.37966e+08, 1.18514e+08, 6.04826e+07, 5.71818e+06, 94873, 4.23588e+08, 2352, 782734, 1.55545e+07, 154613, 1.99889e+08, 2.15e+07, 0, 82074, 2.37966e+08, 513, 0, 5, 1.66852e+06, 1.12388e+08, 3, 4.41172e+06, 44823, 1.18514e+08, +32768, 1024, 3293, 1.49979e+06, 0, 3.38794e+08, 2.18168e+08, 6.46479e+07, 7.62214e+06, 105474, 6.3084e+08, 1203, 763229, 1.54093e+07, 178051, 3.03621e+08, 1.8739e+07, 0, 81892, 3.38794e+08, 221, 0, 0, 1.4968e+06, 2.09942e+08, 0, 6.68318e+06, 45617, 2.18168e+08, 
+32768, 1024, 1575, 1.57192e+06, 0, 3.43929e+08, 2.25157e+08, 6.49867e+07, 7.56997e+06, 97133, 6.43313e+08, 1169, 763463, 1.53952e+07, 176298, 3.08406e+08, 1.91044e+07, 0, 82255, 3.43929e+08, 259, 0, 5, 1.55154e+06, 2.16648e+08, 9, 6.90993e+06, 46742, 2.25157e+08, +32768, 1024, 2809, 1.57014e+06, 0, 3.45777e+08, 2.28804e+08, 6.43802e+07, 7.48632e+06, 99329, 6.48121e+08, 1226, 760918, 1.53771e+07, 181940, 3.10055e+08, 1.93196e+07, 0, 82033, 3.45777e+08, 260, 0, 3, 1.61326e+06, 2.20416e+08, 2, 6.7287e+06, 46240, 2.28804e+08, +32768, 1024, 3081, 1.58798e+06, 0, 3.42017e+08, 2.30422e+08, 6.42357e+07, 7.6737e+06, 101775, 6.46041e+08, 1170, 763851, 1.55893e+07, 181878, 3.06806e+08, 1.85922e+07, 0, 81806, 3.42017e+08, 254, 0, 6, 1.51666e+06, 2.21998e+08, 0, 6.86128e+06, 46166, 2.30422e+08, +32768, 1024, 1774, 1.49014e+06, 0, 3.45951e+08, 2.23e+08, 6.4875e+07, 7.58086e+06, 110909, 6.43009e+08, 1158, 764151, 1.56397e+07, 182196, 3.10411e+08, 1.88704e+07, 0, 81869, 3.45951e+08, 277, 0, 0, 1.50595e+06, 2.14776e+08, 9, 6.67147e+06, 46502, 2.23e+08, +32768, 2048, 7037, 3.11235e+06, 0, 5.60151e+08, 4.45513e+08, 6.44157e+07, 1.08803e+07, 105158, 1.08418e+09, 840, 771084, 1.53483e+07, 215488, 5.2643e+08, 1.73062e+07, 0, 78882, 5.60151e+08, 125, 0, 2, 1.51169e+06, 4.2984e+08, 10, 1.41141e+07, 47607, 4.45513e+08, +32768, 2048, 4783, 2.95076e+06, 0, 5.6477e+08, 4.48906e+08, 6.50818e+07, 1.13129e+07, 107241, 1.09313e+09, 704, 772614, 1.56612e+07, 216953, 5.30933e+08, 1.71072e+07, 0, 78547, 5.6477e+08, 126, 0, 2, 1.53912e+06, 4.33141e+08, 0, 1.41773e+07, 48160, 4.48906e+08, +32768, 2048, 7367, 2.91282e+06, 0, 5.58659e+08, 4.46637e+08, 6.55101e+07, 1.09537e+07, 104429, 1.08478e+09, 657, 773287, 1.54636e+07, 213528, 5.25087e+08, 1.70423e+07, 0, 78306, 5.58659e+08, 114, 0, 4, 1.47357e+06, 4.31067e+08, 4, 1.40477e+07, 47668, 4.46637e+08, +32768, 2048, 5374, 2.91658e+06, 0, 5.58503e+08, 4.4914e+08, 6.34319e+07, 1.08785e+07, 106220, 1.08498e+09, 687, 773610, 1.57371e+07, 217424, 5.24755e+08, 
1.69402e+07, 0, 79284, 5.58503e+08, 116, 0, 2, 1.50752e+06, 4.33361e+08, 0, 1.42237e+07, 47282, 4.4914e+08, +32768, 2048, 4768, 3.02516e+06, 0, 5.613e+08, 4.45549e+08, 6.48901e+07, 1.1128e+07, 104887, 1.086e+09, 946, 780927, 1.5507e+07, 213364, 5.28157e+08, 1.65626e+07, 0, 78647, 5.613e+08, 123, 0, 3, 1.49003e+06, 4.29869e+08, 3, 1.41423e+07, 47540, 4.45549e+08, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_65536_cols_65536_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_65536_cols_65536_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..8b67877c --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_65536_cols_65536_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,35 @@ +65536, 32, 109, 1.02826e+06, 5, 1.84417e+08, 3.79389e+07, 9.81045e+08, 1.06083e+07, 384435, 1.21542e+09, 224047, 5.52374e+06, 7.30613e+07, 62916, 2.98189e+07, 7.5564e+07, 0, 160580, 1.84415e+08, 3046, 0, 1, 5.97258e+06, 2.17399e+07, 8, 1.01369e+07, 85720, 3.79382e+07, +65536, 32, 4666, 979531, 0, 1.86456e+08, 4.10717e+07, 9.80316e+08, 8.88833e+06, 359281, 1.21807e+09, 224636, 5.5105e+06, 7.2134e+07, 63672, 2.92179e+07, 7.91461e+07, 0, 157568, 1.86454e+08, 3604, 0, 7, 7.05014e+06, 2.13702e+07, 5, 1.25613e+07, 85837, 4.10711e+07, +65536, 32, 7704, 971128, 0, 1.85309e+08, 3.94939e+07, 9.81195e+08, 9.33793e+06, 354498, 1.21667e+09, 229073, 5.52652e+06, 7.18764e+07, 61327, 2.8945e+07, 7.85129e+07, 0, 156741, 1.85308e+08, 3523, 0, 0, 6.70864e+06, 2.11237e+07, 4, 1.15721e+07, 85428, 3.94933e+07, +65536, 32, 7550, 968076, 0, 1.88887e+08, 4.07054e+07, 9.81063e+08, 9.70302e+06, 374279, 1.22171e+09, 229448, 5.5914e+06, 7.22767e+07, 63714, 2.92043e+07, 8.13641e+07, 0, 156427, 1.88886e+08, 3468, 0, 2, 7.06923e+06, 2.17723e+07, 2, 1.1774e+07, 85800, 4.07048e+07, +65536, 32, 8324, 969045, 0, 1.81509e+08, 3.83204e+07, 9.8573e+08, 
9.62803e+06, 403310, 1.21657e+09, 225881, 5.71083e+06, 7.13741e+07, 61078, 2.87546e+07, 7.52242e+07, 0, 156875, 1.81507e+08, 2679, 0, 0, 6.7481e+06, 2.08639e+07, 11, 1.06189e+07, 86180, 3.83198e+07, +65536, 64, 9282, 1.01076e+06, 0, 2.43812e+08, 4.9046e+07, 5.98997e+08, 1.05082e+07, 299019, 9.03682e+08, 81070, 4.41927e+06, 8.32158e+07, 91633, 7.20776e+07, 8.37623e+07, 0, 163841, 2.43812e+08, 3003, 0, 7, 5.81618e+06, 3.32962e+07, 7, 9.84585e+06, 84173, 4.90454e+07, +65536, 64, 11553, 998221, 4, 2.56634e+08, 4.78293e+07, 5.9215e+08, 1.03759e+07, 300886, 9.083e+08, 88293, 4.25034e+06, 8.46189e+07, 88949, 7.24776e+07, 9.49444e+07, 0, 165131, 2.56634e+08, 2474, 0, 0, 5.87025e+06, 3.25767e+07, 2, 9.2948e+06, 84243, 4.78285e+07, +65536, 64, 4889, 1.01187e+06, 0, 2.42685e+08, 4.85945e+07, 5.97539e+08, 1.04733e+07, 314844, 9.00623e+08, 81267, 4.25524e+06, 8.31191e+07, 86457, 7.31193e+07, 8.18555e+07, 0, 166907, 2.42684e+08, 2610, 0, 7, 6.01843e+06, 3.32246e+07, 2, 9.26384e+06, 84203, 4.85937e+07, +65536, 64, 7333, 1.00858e+06, 0, 2.58236e+08, 4.76443e+07, 5.94395e+08, 1.05685e+07, 301521, 9.12162e+08, 90328, 4.25183e+06, 8.41009e+07, 89846, 7.47654e+07, 9.47691e+07, 0, 168068, 2.58235e+08, 2825, 0, 0, 5.87337e+06, 3.29082e+07, 7, 8.77474e+06, 84409, 4.76436e+07, +65536, 64, 14035, 1.00711e+06, 0, 2.52567e+08, 4.67938e+07, 5.98974e+08, 1.05535e+07, 302260, 9.10212e+08, 87184, 4.26885e+06, 8.3666e+07, 93691, 7.34105e+07, 9.08766e+07, 0, 163920, 2.52567e+08, 2929, 0, 4, 5.5223e+06, 3.24896e+07, 2, 8.69364e+06, 84636, 4.67931e+07, +65536, 128, 12867, 1.19193e+06, 0, 4.05526e+08, 1.18745e+08, 4.09114e+08, 1.0731e+07, 304550, 9.45627e+08, 36851, 3.62399e+06, 8.37169e+07, 190695, 2.35591e+08, 8.22046e+07, 0, 161563, 4.05526e+08, 2129, 0, 2, 6.88186e+06, 1.00855e+08, 49, 1.0918e+07, 88088, 1.18745e+08, +65536, 128, 66, 1.18931e+06, 0, 4.01718e+08, 1.17875e+08, 4.06448e+08, 1.04527e+07, 279515, 9.37963e+08, 55038, 3.6495e+06, 8.15391e+07, 173592, 2.34989e+08, 8.11468e+07, 0, 164615, 
4.01718e+08, 1828, 0, 13, 7.03505e+06, 9.98084e+07, 0, 1.09403e+07, 89149, 1.17875e+08, +65536, 128, 69, 1.17455e+06, 0, 4.03621e+08, 1.18135e+08, 4.04139e+08, 1.0323e+07, 285195, 9.37678e+08, 60650, 3.71563e+06, 8.12799e+07, 170984, 2.36556e+08, 8.16743e+07, 0, 162951, 4.03621e+08, 2036, 0, 10, 7.07993e+06, 1.00229e+08, 13, 1.07349e+07, 88380, 1.18135e+08, +65536, 128, 57, 1.20871e+06, 0, 4.10824e+08, 1.15459e+08, 4.12098e+08, 1.05473e+07, 271994, 9.50409e+08, 59010, 3.77736e+06, 8.24645e+07, 173481, 2.43427e+08, 8.07608e+07, 0, 160781, 4.10823e+08, 1930, 0, 3, 6.75866e+06, 9.77543e+07, 2, 1.08554e+07, 88488, 1.15459e+08, +65536, 128, 94, 1.19095e+06, 0, 4.02574e+08, 1.18734e+08, 4.10142e+08, 1.04154e+07, 276081, 9.43333e+08, 59070, 3.7918e+06, 8.19326e+07, 183631, 2.38152e+08, 7.82919e+07, 0, 162772, 4.02574e+08, 2059, 0, 3, 6.75783e+06, 1.00854e+08, 6, 1.10325e+07, 87914, 1.18734e+08, +65536, 256, 238, 1.66963e+06, 0, 6.29934e+08, 2.24172e+08, 3.64322e+08, 1.42026e+07, 335841, 1.23464e+09, 28125, 3.26458e+06, 7.78147e+07, 248228, 4.60263e+08, 8.81405e+07, 0, 174254, 6.29934e+08, 999, 0, 0, 6.129e+06, 2.07555e+08, 6, 1.03966e+07, 90486, 2.24172e+08, +65536, 256, 170, 1.71633e+06, 0, 6.3064e+08, 2.25674e+08, 3.66261e+08, 1.42474e+07, 336228, 1.23888e+09, 30320, 3.31259e+06, 7.83358e+07, 249381, 4.62453e+08, 8.60866e+07, 0, 172353, 6.3064e+08, 1163, 0, 4, 6.02053e+06, 2.09163e+08, 3, 1.03987e+07, 91178, 2.25674e+08, +65536, 256, 326, 1.68956e+06, 0, 6.35123e+08, 2.24875e+08, 3.63305e+08, 1.42111e+07, 327719, 1.23953e+09, 29129, 3.26596e+06, 7.763e+07, 246376, 4.6562e+08, 8.81575e+07, 0, 174365, 6.35123e+08, 935, 0, 4, 5.89748e+06, 2.08641e+08, 11, 1.02444e+07, 90498, 2.24875e+08, +65536, 256, 156, 1.68145e+06, 0, 6.32769e+08, 2.30304e+08, 3.65127e+08, 1.45636e+07, 339147, 1.24478e+09, 28678, 3.26942e+06, 7.76172e+07, 255628, 4.64717e+08, 8.67081e+07, 0, 172875, 6.32769e+08, 1110, 0, 12, 6.16336e+06, 2.13523e+08, 3, 1.05253e+07, 91281, 2.30304e+08, +65536, 256, 189, 
1.77631e+06, 0, 6.38395e+08, 2.29038e+08, 3.65761e+08, 1.42238e+07, 325957, 1.24952e+09, 30744, 3.32142e+06, 7.81754e+07, 253452, 4.69321e+08, 8.71202e+07, 0, 173166, 6.38395e+08, 1228, 0, 7, 6.27234e+06, 2.12444e+08, 2, 1.02284e+07, 91036, 2.29037e+08, +65536, 512, 291, 3.22198e+06, 0, 9.57186e+08, 4.30808e+08, 3.67716e+08, 2.13546e+07, 362074, 1.78065e+09, 7740, 3.09907e+06, 7.43904e+07, 329435, 7.95616e+08, 8.35672e+07, 0, 175581, 9.57186e+08, 940, 0, 5, 5.77294e+06, 4.11486e+08, 7, 1.34553e+07, 92387, 4.30807e+08, +65536, 512, 661, 3.20132e+06, 0, 9.72218e+08, 4.30989e+08, 3.59356e+08, 2.14718e+07, 364945, 1.7876e+09, 7140, 3.08542e+06, 7.48118e+07, 324702, 8.11803e+08, 8.20111e+07, 0, 174735, 9.72218e+08, 977, 0, 5, 5.62936e+06, 4.12002e+08, 5, 1.32652e+07, 91897, 4.30989e+08, +65536, 512, 651, 3.18838e+06, 0, 1.01409e+09, 4.34356e+08, 3.69496e+08, 2.18384e+07, 378301, 1.84335e+09, 6139, 3.09201e+06, 7.60905e+07, 348360, 8.53788e+08, 8.059e+07, 0, 172439, 1.01409e+09, 896, 0, 4, 5.53793e+06, 4.15152e+08, 8, 1.35719e+07, 92815, 4.34356e+08, +65536, 512, 492, 3.19447e+06, 0, 9.72722e+08, 4.26933e+08, 3.60194e+08, 2.12526e+07, 372135, 1.78467e+09, 7821, 3.08878e+06, 7.42399e+07, 321647, 8.11262e+08, 8.36255e+07, 0, 176665, 9.72722e+08, 1069, 0, 11, 5.58425e+06, 4.07776e+08, 7, 1.34803e+07, 91875, 4.26933e+08, +65536, 512, 616, 3.1751e+06, 0, 9.37876e+08, 4.24576e+08, 3.62807e+08, 2.14057e+07, 364223, 1.7502e+09, 8206, 3.09044e+06, 7.40136e+07, 314600, 7.77484e+08, 8.27881e+07, 0, 176852, 9.37876e+08, 1020, 0, 0, 5.57372e+06, 4.05624e+08, 0, 1.32847e+07, 92854, 4.24576e+08, +65536, 1024, 2018, 6.00256e+06, 0, 1.51248e+09, 8.83343e+08, 3.85341e+08, 2.596e+07, 373704, 2.8135e+09, 4295, 3.00231e+06, 7.61568e+07, 374826, 1.35194e+09, 8.08234e+07, 0, 176360, 1.51248e+09, 633, 0, 5, 5.61286e+06, 8.53633e+08, 9, 2.40023e+07, 93905, 8.83343e+08, +65536, 1024, 2672, 5.84242e+06, 0, 1.47589e+09, 8.8396e+08, 3.86221e+08, 2.58363e+07, 389332, 2.77814e+09, 3930, 3.00629e+06, 
7.65917e+07, 386556, 1.31603e+09, 7.96991e+07, 0, 173117, 1.47589e+09, 651, 0, 4, 5.65658e+06, 8.54032e+08, 5, 2.41758e+07, 94722, 8.8396e+08, +65536, 1024, 3363, 6.15631e+06, 0, 1.55854e+09, 8.89813e+08, 3.8231e+08, 2.59678e+07, 392573, 2.86319e+09, 3703, 3.00148e+06, 7.77775e+07, 414152, 1.39569e+09, 8.14845e+07, 0, 172818, 1.55854e+09, 831, 0, 3, 5.79228e+06, 8.59801e+08, 7, 2.41236e+07, 95034, 8.89813e+08, +65536, 1024, 2045, 6.09746e+06, 0, 1.53253e+09, 8.86623e+08, 3.89716e+08, 2.60054e+07, 385517, 2.84136e+09, 3819, 3.00484e+06, 7.71789e+07, 417460, 1.37043e+09, 8.13251e+07, 0, 171803, 1.53253e+09, 819, 0, 5, 5.73564e+06, 8.57108e+08, 5, 2.36835e+07, 94854, 8.86623e+08, +65536, 1024, 2518, 6.20922e+06, 0, 1.53515e+09, 8.88116e+08, 3.90389e+08, 2.59202e+07, 390070, 2.84617e+09, 3739, 3.00113e+06, 7.78112e+07, 423855, 1.37236e+09, 8.13763e+07, 0, 171385, 1.53515e+09, 702, 0, 0, 5.73106e+06, 8.58794e+08, 4, 2.34961e+07, 94698, 8.88116e+08, +65536, 2048, 6440, 1.12229e+07, 0, 2.34786e+09, 1.81292e+09, 3.64475e+08, 3.83347e+07, 392573, 4.57522e+09, 1924, 3.03462e+06, 8.01052e+07, 474966, 2.19085e+09, 7.32264e+07, 0, 170979, 2.34786e+09, 392, 0, 7, 6.02657e+06, 1.76845e+09, 8, 3.83449e+07, 98635, 1.81292e+09, +65536, 2048, 5594, 1.12643e+07, 0, 2.35271e+09, 1.81047e+09, 3.63259e+08, 3.75531e+07, 401952, 4.57566e+09, 2087, 3.03834e+06, 7.98642e+07, 479380, 2.19643e+09, 7.27248e+07, 0, 170583, 2.35271e+09, 393, 0, 5, 5.91811e+06, 1.76629e+09, 1, 3.81677e+07, 97714, 1.81047e+09, +65536, 2048, 6627, 1.1273e+07, 0, 2.308e+09, 1.7936e+09, 3.69624e+08, 3.7679e+07, 390139, 4.52057e+09, 2040, 3.03714e+06, 7.90549e+07, 447454, 2.153e+09, 7.22819e+07, 0, 172836, 2.308e+09, 492, 0, 7, 5.85516e+06, 1.74911e+09, 2, 3.85339e+07, 97389, 1.7936e+09, +65536, 2048, 7430, 1.12153e+07, 0, 2.32467e+09, 1.80514e+09, 3.66322e+08, 3.76516e+07, 397431, 4.5454e+09, 2191, 3.03591e+06, 7.89934e+07, 427217, 2.16928e+09, 7.27595e+07, 0, 175105, 2.32467e+09, 303, 0, 2, 6.03983e+06, 1.76033e+09, 
1, 3.8668e+07, 96645, 1.80514e+09, +65536, 2048, 2237, 1.12428e+07, 0, 2.31358e+09, 1.79709e+09, 3.6434e+08, 3.72606e+07, 392241, 4.52391e+09, 1978, 3.04476e+06, 7.89867e+07, 429836, 2.15905e+09, 7.18918e+07, 0, 172377, 2.31358e+09, 321, 0, 0, 5.89368e+06, 1.75236e+09, 1, 3.87392e+07, 96949, 1.79709e+09, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_8192_cols_8192_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_8192_cols_8192_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..0b07e124 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/HQRRP_runtime_breakdown_8192_cols_8192_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,35 @@ +8192, 32, 54, 50819, 0, 1.34174e+06, 1.10143e+06, 8.10109e+06, 293729, 18519, 1.09074e+07, 4945, 101896, 154633, 1238, 454483, 605160, 0, 19227, 1.34158e+06, 921, 0, 0, 48523, 204991, 1, 836438, 10440, 1.10131e+06, +8192, 32, 511, 13191, 0, 1.31666e+06, 1.08188e+06, 7.95092e+06, 300167, 21107, 1.06844e+07, 3547, 101447, 138764, 1138, 448213, 604418, 0, 18975, 1.3165e+06, 1121, 0, 2, 48479, 206314, 10, 815375, 10476, 1.08178e+06, +8192, 32, 206, 11879, 0, 1.36144e+06, 1.0882e+06, 7.98353e+06, 310260, 19136, 1.07746e+07, 3944, 101276, 182530, 1138, 448919, 603959, 0, 19509, 1.36128e+06, 1091, 0, 0, 48404, 213270, 5, 814800, 10500, 1.08807e+06, +8192, 32, 191, 13432, 0, 1.37336e+06, 1.09016e+06, 8.05022e+06, 316895, 19552, 1.08638e+07, 3782, 102135, 199228, 1183, 445720, 601758, 0, 19377, 1.37318e+06, 1098, 1, 0, 48396, 208556, 1, 821506, 10495, 1.09005e+06, +8192, 32, 196, 13288, 0, 1.37288e+06, 1.08612e+06, 7.99766e+06, 310763, 18794, 1.07997e+07, 3770, 102057, 199615, 1133, 445835, 601035, 0, 19258, 1.3727e+06, 1026, 0, 0, 48326, 208571, 0, 817614, 10480, 1.08602e+06, +8192, 64, 44, 21045, 0, 1.61728e+06, 1.21892e+06, 5.97905e+06, 386630, 17498, 9.24047e+06, 2277, 74780, 169329, 
479, 745258, 605710, 0, 19366, 1.6172e+06, 497, 0, 0, 49994, 412203, 0, 745716, 10445, 1.21886e+06, +8192, 64, 1704, 19641, 0, 1.61206e+06, 1.26726e+06, 5.94946e+06, 387834, 19999, 9.25796e+06, 3177, 84771, 167031, 441, 743570, 593917, 0, 19065, 1.61197e+06, 526, 0, 0, 49518, 457815, 0, 749242, 10109, 1.26721e+06, +8192, 64, 232, 21610, 0, 1.59723e+06, 1.30148e+06, 5.93136e+06, 388073, 20538, 9.26052e+06, 3533, 88410, 135420, 550, 767564, 582594, 0, 19081, 1.59715e+06, 523, 0, 0, 47985, 487276, 3, 755386, 10263, 1.30144e+06, +8192, 64, 177, 20035, 0, 1.64211e+06, 1.32491e+06, 6.00202e+06, 397833, 19923, 9.40701e+06, 3739, 92766, 174957, 436, 776316, 574788, 0, 19017, 1.64202e+06, 566, 0, 0, 48066, 519057, 0, 746914, 10262, 1.32486e+06, +8192, 64, 154, 17210, 0, 1.60868e+06, 1.27994e+06, 5.95433e+06, 389035, 20684, 9.27004e+06, 3289, 86384, 154884, 496, 762850, 581800, 0, 18907, 1.60861e+06, 591, 0, 0, 47822, 470649, 0, 750492, 10333, 1.27989e+06, +8192, 128, 38, 22994, 0, 1.98997e+06, 1.49861e+06, 4.85201e+06, 285401, 13878, 8.6629e+06, 1064, 69326, 141418, 344, 1.19393e+06, 566239, 0, 17609, 1.98993e+06, 365, 0, 0, 46518, 788195, 0, 653368, 10144, 1.49859e+06, +8192, 128, 2607, 20357, 0, 1.95387e+06, 1.47262e+06, 4.79972e+06, 293903, 10646, 8.55373e+06, 844, 60289, 139872, 289, 1.15954e+06, 575893, 0, 17096, 1.95383e+06, 358, 0, 0, 46805, 782004, 2, 633214, 10220, 1.4726e+06, +8192, 128, 244, 22091, 0, 1.95753e+06, 1.47836e+06, 4.78902e+06, 292487, 10486, 8.55022e+06, 912, 64318, 140211, 252, 1.16835e+06, 566001, 0, 17439, 1.95749e+06, 385, 0, 0, 46533, 789735, 4, 631436, 10239, 1.47833e+06, +8192, 128, 345, 27952, 0, 1.95303e+06, 1.467e+06, 4.76895e+06, 296557, 11295, 8.52512e+06, 958, 63835, 129755, 277, 1.16519e+06, 575754, 0, 17220, 1.95299e+06, 368, 0, 2, 46616, 785138, 0, 624689, 10159, 1.46697e+06, +8192, 128, 602, 30037, 0, 2.03831e+06, 1.49594e+06, 4.83394e+06, 281181, 10073, 8.69007e+06, 946, 64333, 145110, 285, 1.21219e+06, 597513, 0, 17893, 
2.03827e+06, 395, 0, 0, 47157, 794440, 0, 643704, 10204, 1.4959e+06, +8192, 256, 74, 52976, 0, 2.81813e+06, 2.26407e+06, 4.25789e+06, 361726, 5799, 9.76067e+06, 438, 52513, 165547, 172, 2.02376e+06, 557304, 0, 18379, 2.81812e+06, 208, 0, 0, 45653, 1.53414e+06, 0, 674021, 10037, 2.26406e+06, +8192, 256, 2516, 52020, 0, 2.80611e+06, 2.29256e+06, 4.22216e+06, 364308, 5718, 9.7454e+06, 443, 52639, 182960, 171, 1.99101e+06, 559745, 0, 19120, 2.80609e+06, 189, 0, 2, 45510, 1.54486e+06, 0, 691927, 10054, 2.29255e+06, +8192, 256, 318, 53778, 0, 2.78603e+06, 2.26906e+06, 4.22932e+06, 362091, 6478, 9.70708e+06, 429, 55134, 173695, 182, 1.97876e+06, 558845, 0, 18966, 2.78601e+06, 206, 0, 0, 45515, 1.56535e+06, 0, 647891, 10090, 2.26905e+06, +8192, 256, 368, 53690, 0, 2.75906e+06, 2.24833e+06, 4.1904e+06, 353997, 6116, 9.61197e+06, 456, 55431, 169099, 187, 1.95678e+06, 558239, 0, 18852, 2.75905e+06, 226, 0, 0, 45472, 1.49552e+06, 1, 696992, 10108, 2.24832e+06, +8192, 256, 410, 53394, 0, 2.74444e+06, 2.26061e+06, 4.1576e+06, 367758, 7499, 9.59172e+06, 467, 54922, 182743, 200, 1.93064e+06, 556764, 0, 18688, 2.74442e+06, 229, 0, 0, 45516, 1.50364e+06, 0, 701118, 10092, 2.2606e+06, +8192, 512, 77, 101099, 0, 4.57149e+06, 3.83121e+06, 3.84092e+06, 536796, 5941, 1.28875e+07, 269, 54528, 251039, 1065, 3.69827e+06, 547685, 0, 18618, 4.57148e+06, 188, 0, 0, 44852, 2.87759e+06, 0, 898516, 10050, 3.83119e+06, +8192, 512, 595, 103278, 0, 4.69477e+06, 3.8001e+06, 3.79601e+06, 532686, 5746, 1.29332e+07, 221, 50237, 250712, 1323, 3.82586e+06, 547549, 0, 18854, 4.69476e+06, 214, 0, 0, 44906, 2.8625e+06, 0, 882513, 9950, 3.80009e+06, +8192, 512, 192, 106166, 0, 4.56196e+06, 3.7722e+06, 3.81519e+06, 545968, 6206, 1.28079e+07, 213, 51434, 242563, 1189, 3.70082e+06, 546961, 0, 18768, 4.56194e+06, 195, 0, 0, 44847, 2.85902e+06, 0, 858131, 9990, 3.77219e+06, +8192, 512, 212, 72056, 0, 4.72776e+06, 3.78196e+06, 3.78277e+06, 532116, 5679, 1.29026e+07, 210, 50444, 247761, 1169, 3.86268e+06, 546487, 0, 
18988, 4.72774e+06, 207, 0, 0, 44815, 2.85384e+06, 1, 873042, 10036, 3.78194e+06, +8192, 512, 417, 103129, 0, 4.64336e+06, 3.79696e+06, 3.79503e+06, 543489, 7021, 1.28894e+07, 249, 50301, 240491, 1000, 3.78256e+06, 550309, 0, 18432, 4.64335e+06, 184, 0, 0, 44771, 2.84728e+06, 0, 894691, 10016, 3.79694e+06, +8192, 1024, 1510, 166195, 0, 1.53977e+07, 1.10938e+07, 3.8837e+06, 1.12075e+06, 12964, 3.16767e+07, 146, 52418, 202790, 6857, 1.4471e+07, 647277, 0, 17204, 1.53977e+07, 137, 0, 0, 47190, 9.73933e+06, 0, 1.29615e+06, 11030, 1.10938e+07, +8192, 1024, 705, 160889, 0, 1.55796e+07, 1.06414e+07, 3.72653e+06, 1.11243e+06, 13913, 3.12355e+07, 154, 51072, 196596, 6694, 1.46477e+07, 660330, 0, 17085, 1.55796e+07, 134, 0, 0, 47357, 9.34639e+06, 0, 1.23644e+06, 11061, 1.06414e+07, +8192, 1024, 637, 158516, 0, 1.51566e+07, 1.11245e+07, 3.75949e+06, 1.13797e+06, 13622, 3.13513e+07, 91, 50855, 202660, 8006, 1.4229e+07, 648714, 0, 17227, 1.51566e+07, 78, 0, 0, 48089, 9.73575e+06, 1, 1.32876e+06, 11783, 1.11245e+07, +8192, 1024, 894, 165508, 0, 1.51458e+07, 1.11449e+07, 3.72218e+06, 1.12109e+06, 14051, 3.13144e+07, 100, 50982, 196577, 7605, 1.42049e+07, 668618, 0, 16992, 1.51458e+07, 70, 0, 4, 48354, 9.82728e+06, 0, 1.25733e+06, 11843, 1.11449e+07, +8192, 1024, 1228, 174421, 0, 1.50846e+07, 1.11669e+07, 3.7772e+06, 1.11523e+06, 14208, 3.13338e+07, 105, 51931, 213279, 7524, 1.4132e+07, 662747, 0, 17040, 1.50846e+07, 74, 0, 2, 48281, 9.82004e+06, 0, 1.28661e+06, 11888, 1.11669e+07, +8192, 2048, 4363, 284712, 0, 2.87972e+07, 2.66436e+07, 3.46957e+06, 1.54336e+06, 15290, 6.07581e+07, 55, 51872, 135467, 13803, 2.80334e+07, 547675, 0, 14903, 2.87972e+07, 38, 0, 7, 51819, 2.43332e+07, 0, 2.24516e+06, 13342, 2.66436e+07, +8192, 2048, 2619, 213220, 0, 2.89154e+07, 2.95084e+07, 3.50948e+06, 1.62117e+06, 14238, 6.37844e+07, 71, 53546, 145770, 13546, 2.81391e+07, 548542, 0, 14767, 2.89154e+07, 35, 0, 6, 52053, 2.71348e+07, 0, 2.30845e+06, 12962, 2.95083e+07, +8192, 2048, 3244, 208631, 0, 
2.87189e+07, 2.71656e+07, 3.55434e+06, 1.54434e+06, 11684, 6.12068e+07, 50, 51820, 154402, 12262, 2.79378e+07, 547765, 0, 14862, 2.87189e+07, 64, 0, 0, 50566, 2.47671e+07, 0, 2.33588e+06, 12022, 2.71656e+07, +8192, 2048, 1928, 210940, 0, 2.86292e+07, 2.73938e+07, 3.53013e+06, 1.57237e+06, 13118, 6.13514e+07, 67, 51726, 150585, 11959, 2.78577e+07, 542260, 0, 14879, 2.86292e+07, 62, 0, 3, 50469, 2.49986e+07, 3, 2.33273e+06, 11896, 2.73938e+07, +8192, 2048, 1987, 274390, 0, 2.94549e+07, 2.64835e+07, 3.47342e+06, 1.5288e+06, 13002, 6.123e+07, 53, 55600, 158283, 12471, 2.86674e+07, 546275, 0, 14809, 2.94549e+07, 59, 0, 0, 50528, 2.41294e+07, 0, 2.29181e+06, 11777, 2.64835e+07, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_16384_col_start_32_col_stop_2048.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_16384_col_start_32_col_stop_2048.dat new file mode 100644 index 00000000..53b7b364 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_16384_col_start_32_col_stop_2048.dat @@ -0,0 +1,111 @@ +16384, 32, 37267, 15573, 68085, +16384, 32, 11383, 14305, 65458, +16384, 32, 11170, 14386, 66559, +16384, 32, 11130, 14253, 70069, +16384, 32, 11222, 14305, 66935, +16384, 64, 49334, 21383, 77089, +16384, 64, 48924, 21398, 78058, +16384, 64, 46178, 17802, 77391, +16384, 64, 47171, 13623, 70374, +16384, 64, 42260, 18669, 68877, +16384, 128, 114860, 24482, 87895, +16384, 128, 117031, 24738, 84243, +16384, 128, 115886, 22119, 87174, +16384, 128, 115900, 22213, 84514, +16384, 128, 113707, 22071, 87157, +16384, 256, 703584, 42026, 113183, +16384, 256, 697816, 41623, 114892, +16384, 256, 706238, 43599, 116551, +16384, 256, 700377, 43096, 116798, +16384, 256, 724501, 42635, 117004, +16384, 512, 2783891, 83847, 241783, +16384, 512, 2807587, 79547, 221425, +16384, 512, 2863999, 80134, 220154, +16384, 512, 2844821, 83192, 247164, +16384, 512, 2889773, 80197, 224974, +16384, 1024, 6784010, 
217129, 603467, +16384, 1024, 6787268, 219524, 622079, +16384, 1024, 6807913, 220366, 586541, +16384, 1024, 7063713, 220067, 612129, +16384, 1024, 6814177, 218316, 647717, +16384, 2048, 17852068, 2227735, 2071623, +16384, 2048, 18576222, 2272341, 2015755, +16384, 2048, 18950735, 2207694, 2017471, +16384, 2048, 18218841, 2218236, 2042504, +16384, 2048, 18367670, 2191259, 2053922, +16384, 32, 2840, 5949, 1916, 2918, +16384, 32, 2397, 3599, 2582, 3823, +16384, 32, 2373, 3862, 2418, 3731, +16384, 32, 2445, 3680, 2815, 3874, +16384, 32, 2107, 3677, 2655, 4095, +16384, 64, 5427, 5342, 6081, 10923, +16384, 64, 4748, 5265, 5061, 11094, +16384, 64, 4533, 5053, 5203, 11036, +16384, 64, 4732, 4925, 5473, 11162, +16384, 64, 4710, 5197, 4912, 10496, +16384, 128, 10738, 12557, 11376, 20192, +16384, 128, 10791, 12069, 11626, 20795, +16384, 128, 11625, 13414, 12248, 21333, +16384, 128, 10465, 12356, 11468, 20779, +16384, 128, 11037, 12946, 11199, 20815, +16384, 256, 28265, 26793, 40057, 55856, +16384, 256, 30734, 27576, 40220, 56238, +16384, 256, 29755, 30102, 35741, 53461, +16384, 256, 26556, 29890, 35301, 53934, +16384, 256, 26533, 31801, 35830, 51966, +16384, 512, 57975, 56556, 125494, 165443, +16384, 512, 56776, 57620, 110136, 152702, +16384, 512, 57176, 57679, 109810, 150147, +16384, 512, 60538, 57644, 109661, 152917, +16384, 512, 57007, 58288, 110395, 153421, +16384, 1024, 368228, 372860, 281325, 387963, +16384, 1024, 353041, 364856, 287726, 402318, +16384, 1024, 360993, 379901, 298703, 404662, +16384, 1024, 369173, 363286, 290190, 399638, +16384, 1024, 362880, 361685, 305671, 410763, +16384, 2048, 557571, 571940, 545938, 911092, +16384, 2048, 564793, 539925, 527167, 890722, +16384, 2048, 575783, 558694, 527932, 895453, +16384, 2048, 548243, 559475, 538004, 902363, +16384, 2048, 555097, 558566, 535981, 906093, +16384, 32, 32, 1679, 7278820, 4963, 4963, +16384, 32, 32, 1397, 7235959, 4276, 4276, +16384, 32, 32, 1670, 7267169, 4759, 4759, +16384, 32, 32, 1614, 6988886, 4713, 
4713, +16384, 32, 32, 1664, 7046254, 4783, 4783, +16384, 64, 32, 3234, 13892493, 24611, 24611, 18265, +16384, 64, 32, 3050, 13883057, 26087, 26087, 11649, +16384, 64, 32, 3034, 14267350, 23004, 23004, 14140, +16384, 64, 32, 2977, 13768161, 23312, 23312, 16945, +16384, 64, 32, 3103, 13827598, 26035, 26035, 17315, +16384, 128, 32, 8248, 27428983, 28300, 28300, 31758, 16285, +16384, 128, 32, 9231, 27427952, 28173, 28173, 29598, 15568, +16384, 128, 32, 8176, 27500872, 26269, 26269, 29097, 19131, +16384, 128, 32, 8323, 27566603, 27291, 27291, 28523, 14568, +16384, 128, 32, 7370, 27629460, 26101, 26101, 31845, 15356, +16384, 256, 32, 35254, 1039771, 59825, 59825, 36030, 61131, 32330, +16384, 256, 32, 33063, 1041354, 60395, 60395, 36122, 61948, 32896, +16384, 256, 32, 32894, 1048583, 60292, 60292, 35645, 58768, 31176, +16384, 256, 32, 34272, 1043765, 60398, 60398, 36224, 57494, 32417, +16384, 256, 32, 33244, 1043977, 61062, 61062, 35783, 59553, 32103, +16384, 512, 32, 64682, 1898898, 161451, 161451, 101542, 84149, 114923, 85392, +16384, 512, 32, 80327, 1918624, 160879, 160879, 102703, 79862, 112545, 88155, +16384, 512, 32, 78421, 1982641, 158452, 158452, 102179, 80058, 114395, 88871, +16384, 512, 32, 81063, 1910047, 158912, 158912, 102701, 80692, 115022, 88023, +16384, 512, 32, 77588, 1961382, 158272, 158272, 103126, 80875, 114543, 86856, +16384, 1024, 32, 151893, 3667052, 597143, 597143, 401885, 329400, 305558, 294006, 244487, +16384, 1024, 32, 145127, 3639526, 575166, 575166, 409259, 325116, 301742, 296449, 241194, +16384, 1024, 32, 146689, 3718630, 587940, 587940, 410796, 326594, 304277, 304116, 245843, +16384, 1024, 32, 105616, 3465986, 575264, 575264, 405960, 326652, 295297, 295683, 241840, +16384, 1024, 32, 149074, 3678053, 572419, 572419, 410489, 329163, 298578, 293367, 233000, +16384, 2048, 32, 368564, 6847835, 2636448, 2636448, 1695694, 1210898, 978454, 850512, 911255, 883718, +16384, 2048, 32, 363013, 6842951, 2583708, 2583708, 1683805, 1202001, 989565, 882157, 
889778, 878748, +16384, 2048, 32, 370270, 6962359, 2581280, 2581280, 1722309, 1219038, 985007, 868937, 898802, 855030, +16384, 2048, 32, 373295, 7100955, 2751471, 2751471, 1638923, 1196999, 1000687, 869963, 912198, 877934, +16384, 2048, 32, 369870, 6979184, 2692001, 2692001, 1614159, 1212820, 973188, 846255, 881996, 856546, + +WIDE QRCP: m n GEQP3 LUQR CQRRPT + +TSQR: m n GEQRF GEQR CHOLQR CHOLQR_ORHR + +APPLY Q: m n nb_strart ORMQR GEMM GEMQRT(varying NB) diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_32768_col_start_32_col_stop_2048.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_32768_col_start_32_col_stop_2048.dat new file mode 100644 index 00000000..0635c759 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_32768_col_start_32_col_stop_2048.dat @@ -0,0 +1,111 @@ +32768, 32, 51281, 27018, 125837, +32768, 32, 22813, 26303, 127822, +32768, 32, 27545, 25840, 122764, +32768, 32, 29328, 26168, 127468, +32768, 32, 23968, 26209, 123228, +32768, 64, 83411, 23483, 134833, +32768, 64, 79082, 21524, 129589, +32768, 64, 84499, 23776, 134044, +32768, 64, 80883, 22945, 134455, +32768, 64, 75998, 23000, 134405, +32768, 128, 517043, 37016, 148437, +32768, 128, 457002, 36892, 141637, +32768, 128, 510662, 36737, 142092, +32768, 128, 461180, 37531, 141465, +32768, 128, 553675, 37807, 142767, +32768, 256, 1871800, 66373, 187300, +32768, 256, 1938620, 64728, 180407, +32768, 256, 1956097, 65518, 185019, +32768, 256, 1942342, 65916, 181428, +32768, 256, 1955617, 66082, 186224, +32768, 512, 6351083, 134248, 329853, +32768, 512, 6412669, 136804, 311298, +32768, 512, 6309943, 136991, 327923, +32768, 512, 6312438, 136854, 317389, +32768, 512, 6236268, 137432, 305656, +32768, 1024, 13635379, 424513, 764416, +32768, 1024, 14018006, 408735, 846336, +32768, 1024, 13747553, 412174, 812368, +32768, 1024, 15311947, 421828, 789887, +32768, 1024, 15219320, 406607, 821801, +32768, 2048, 
38291139, 987136, 2210990, +32768, 2048, 38338106, 960734, 2160596, +32768, 2048, 38539819, 980534, 2225349, +32768, 2048, 38090393, 972351, 2251917, +32768, 2048, 38000085, 1007294, 2157336, +32768, 32, 3277, 3734, 3961, 4828, +32768, 32, 2722, 3562, 2179, 2801, +32768, 32, 3543, 3638, 3661, 4453, +32768, 32, 2763, 3564, 3745, 4562, +32768, 32, 3508, 3533, 3731, 4558, +32768, 64, 7610, 7212, 6836, 11296, +32768, 64, 7459, 7570, 7423, 11700, +32768, 64, 7560, 7382, 6841, 11355, +32768, 64, 7588, 7479, 7086, 11575, +32768, 64, 7461, 7519, 6804, 11599, +32768, 128, 21694, 22718, 12747, 21441, +32768, 128, 21300, 19028, 11843, 21199, +32768, 128, 20204, 19824, 12814, 21535, +32768, 128, 19744, 18958, 12905, 21617, +32768, 128, 20134, 19007, 12903, 22128, +32768, 256, 39834, 40521, 37334, 56492, +32768, 256, 41588, 40749, 40905, 59466, +32768, 256, 41898, 40007, 40257, 59318, +32768, 256, 41715, 40758, 40198, 58922, +32768, 256, 41496, 39886, 42014, 60507, +32768, 512, 93852, 92498, 112794, 157345, +32768, 512, 87899, 89513, 119532, 167420, +32768, 512, 90967, 89561, 100332, 145221, +32768, 512, 90338, 90646, 114719, 161634, +32768, 512, 91824, 91048, 142323, 188220, +32768, 1024, 342426, 342352, 352210, 459395, +32768, 1024, 341360, 343278, 423067, 529302, +32768, 1024, 340221, 340540, 403769, 510110, +32768, 1024, 341530, 341173, 377423, 484477, +32768, 1024, 341634, 367911, 399093, 507223, +32768, 2048, 615369, 626774, 557915, 843661, +32768, 2048, 618228, 625052, 555642, 871018, +32768, 2048, 619552, 624574, 552357, 862025, +32768, 2048, 620454, 620601, 656006, 973492, +32768, 2048, 624007, 622883, 570558, 886491, +32768, 32, 32, 2325, 28546162, 5838, 5838, +32768, 32, 32, 2776, 28485570, 3849, 3849, +32768, 32, 32, 2924, 28534451, 3704, 3704, +32768, 32, 32, 2742, 29627716, 3671, 3671, +32768, 32, 32, 2864, 28548885, 3803, 3803, +32768, 64, 32, 6076, 56268127, 8440, 8440, 4859, +32768, 64, 32, 6298, 55546329, 8993, 8993, 5027, +32768, 64, 32, 6215, 55807537, 8666, 
8666, 4611, +32768, 64, 32, 6123, 55869444, 8501, 8501, 4537, +32768, 64, 32, 6192, 56175969, 8534, 8534, 4639, +32768, 128, 32, 18016, 111529524, 19519, 19519, 13486, 10182, +32768, 128, 32, 16760, 111388994, 21220, 21220, 13504, 9654, +32768, 128, 32, 17016, 111529686, 19832, 19832, 13495, 9614, +32768, 128, 32, 16940, 111578154, 19740, 19740, 13158, 10574, +32768, 128, 32, 15969, 110120208, 19891, 19891, 13145, 9566, +32768, 256, 32, 50848, 2453195, 56580, 56580, 38785, 36650, 29744, +32768, 256, 32, 51806, 2448723, 54805, 54805, 38027, 36068, 27995, +32768, 256, 32, 50470, 2353654, 53102, 53102, 37394, 36432, 28243, +32768, 256, 32, 51985, 2310883, 53094, 53094, 38360, 36233, 28391, +32768, 256, 32, 47526, 2242947, 52881, 52881, 39009, 37783, 28590, +32768, 512, 32, 106424, 4264403, 189411, 189411, 127766, 94906, 108888, 86947, +32768, 512, 32, 132031, 4286205, 191229, 191229, 129464, 94881, 109447, 89855, +32768, 512, 32, 132935, 4244412, 197030, 197030, 130083, 95455, 107254, 83358, +32768, 512, 32, 127835, 4294125, 192058, 192058, 128867, 93325, 108164, 85718, +32768, 512, 32, 105670, 4112623, 197788, 197788, 130059, 94089, 112408, 86219, +32768, 1024, 32, 303141, 7942249, 954958, 954958, 517057, 371307, 365306, 290871, 246281, +32768, 1024, 32, 300907, 7699200, 929911, 929911, 523947, 361991, 360757, 269472, 251214, +32768, 1024, 32, 304795, 7929355, 898076, 898076, 505426, 346101, 341046, 274121, 238154, +32768, 1024, 32, 324515, 7874864, 920987, 920987, 522746, 365906, 350344, 298335, 246044, +32768, 1024, 32, 317961, 7992129, 912425, 912425, 515743, 361094, 349618, 295740, 234796, +32768, 2048, 32, 673751, 14338284, 3120343, 3120343, 1958337, 1375946, 1099046, 986679, 847036, 725130, +32768, 2048, 32, 666995, 14495960, 3134966, 3134966, 1874768, 1403048, 1094628, 925089, 799602, 722386, +32768, 2048, 32, 652730, 14125546, 2951391, 2951391, 1837108, 1350262, 1042502, 935664, 795567, 731259, +32768, 2048, 32, 656769, 14146992, 2837370, 2837370, 1842778, 
1406845, 1090978, 951732, 784950, 789523, +32768, 2048, 32, 662992, 14588065, 2846685, 2846685, 1940558, 1386412, 1155232, 1008293, 825502, 796752, + +WIDE QRCP: m n GEQP3 LUQR CQRRPT + +TSQR: m n GEQRF GEQR CHOLQR CHOLQR_ORHR + +APPLY Q: m n nb_strart ORMQR GEMM GEMQRT(varying NB) diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_65536_col_start_32_col_stop_2048.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_65536_col_start_32_col_stop_2048.dat new file mode 100644 index 00000000..262145a1 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_65536_col_start_32_col_stop_2048.dat @@ -0,0 +1,111 @@ +65536, 32, 78591, 50206, 239667, +65536, 32, 43341, 48501, 237838, +65536, 32, 44144, 58678, 237164, +65536, 32, 43935, 48387, 236841, +65536, 32, 47643, 48326, 238294, +65536, 64, 355896, 37726, 249974, +65536, 64, 362178, 35184, 248687, +65536, 64, 356653, 35511, 249452, +65536, 64, 434529, 36323, 247813, +65536, 64, 368627, 36586, 248323, +65536, 128, 1219682, 61654, 275057, +65536, 128, 1275870, 63133, 272365, +65536, 128, 1278849, 62882, 274460, +65536, 128, 1201627, 62792, 272148, +65536, 128, 1254536, 62728, 272914, +65536, 256, 3744119, 113149, 381735, +65536, 256, 3974186, 113201, 378609, +65536, 256, 3995874, 109022, 381890, +65536, 256, 3957709, 114354, 389454, +65536, 256, 4182641, 110952, 388411, +65536, 512, 13468623, 238148, 622040, +65536, 512, 13476303, 235285, 620680, +65536, 512, 13694686, 234011, 648633, +65536, 512, 13988615, 237673, 653347, +65536, 512, 13933374, 231287, 653545, +65536, 1024, 33521441, 738047, 1367671, +65536, 1024, 28713052, 732682, 1404164, +65536, 1024, 26964036, 733888, 1321193, +65536, 1024, 26813855, 716905, 1392916, +65536, 1024, 27950777, 727056, 1332520, +65536, 2048, 79304414, 1609167, 2844528, +65536, 2048, 84186462, 1664285, 2763498, +65536, 2048, 76862640, 1628408, 2845244, +65536, 2048, 80503060, 1662368, 2877273, 
+65536, 2048, 80965703, 1644332, 2830678, +65536, 32, 5876, 3585, 6583, 7388, +65536, 32, 6043, 4050, 7190, 7888, +65536, 32, 6337, 4307, 6960, 7743, +65536, 32, 5753, 3920, 7260, 8022, +65536, 32, 6056, 4284, 7144, 7917, +65536, 64, 14489, 10901, 11739, 17043, +65536, 64, 13870, 9015, 11012, 16181, +65536, 64, 15323, 9421, 10778, 15859, +65536, 64, 15991, 7379, 11263, 16023, +65536, 64, 14637, 8051, 11292, 16615, +65536, 128, 28514, 27285, 30193, 38600, +65536, 128, 26383, 24015, 26633, 37472, +65536, 128, 33965, 30860, 28132, 37252, +65536, 128, 37469, 27615, 28761, 37908, +65536, 128, 34263, 27363, 32240, 41486, +65536, 256, 67181, 65963, 113821, 138516, +65536, 256, 61292, 62521, 123614, 149574, +65536, 256, 60497, 61054, 116238, 139352, +65536, 256, 59267, 62324, 113948, 139388, +65536, 256, 59910, 59369, 107252, 129754, +65536, 512, 144859, 143683, 311106, 361683, +65536, 512, 141987, 142894, 330055, 381441, +65536, 512, 144947, 145493, 288925, 342181, +65536, 512, 147533, 144837, 277078, 328812, +65536, 512, 143299, 146450, 312875, 361188, +65536, 1024, 652238, 652672, 802133, 943651, +65536, 1024, 656018, 653845, 798076, 940563, +65536, 1024, 654816, 653073, 832091, 973865, +65536, 1024, 647050, 649522, 764568, 907198, +65536, 1024, 643847, 651677, 784459, 928509, +65536, 2048, 1230048, 1186042, 755303, 1202054, +65536, 2048, 1297926, 1194798, 774193, 1211265, +65536, 2048, 1204602, 1208679, 773140, 1216878, +65536, 2048, 1195612, 1200487, 767867, 1212265, +65536, 2048, 1213251, 1210900, 764478, 1196999, +65536, 32, 32, 3983, 113218567, 5538, 5538, +65536, 32, 32, 5118, 113183765, 4947, 4947, +65536, 32, 32, 4910, 113310073, 5005, 5005, +65536, 32, 32, 4927, 113260939, 4925, 4925, +65536, 32, 32, 5235, 113350192, 4827, 4827, +65536, 64, 32, 10227, 223872860, 13923, 13923, 8277, +65536, 64, 32, 9202, 223894438, 13398, 13398, 7599, +65536, 64, 32, 9727, 224015552, 12673, 12673, 7869, +65536, 64, 32, 10017, 224017326, 14089, 14089, 7349, +65536, 64, 32, 10580, 
224147246, 13852, 13852, 7862, +65536, 128, 32, 29994, 444517225, 29403, 29403, 33816, 22413, +65536, 128, 32, 27781, 445067069, 29546, 29546, 40246, 23902, +65536, 128, 32, 39115, 447312702, 30079, 30079, 36205, 21802, +65536, 128, 32, 47417, 449046654, 31586, 31586, 35344, 24235, +65536, 128, 32, 44585, 447442742, 30885, 30885, 33220, 21264, +65536, 256, 32, 119290, 7560126, 93123, 93123, 69091, 66950, 52860, +65536, 256, 32, 115445, 7466647, 90557, 90557, 63197, 67589, 49812, +65536, 256, 32, 96627, 7458930, 90504, 90504, 62290, 69904, 52945, +65536, 256, 32, 98594, 7361490, 89750, 89750, 65085, 68561, 52786, +65536, 256, 32, 95829, 7365082, 93698, 93698, 61589, 65623, 55654, +65536, 512, 32, 300454, 13115920, 406764, 406764, 228232, 170453, 185717, 119342, +65536, 512, 32, 326936, 13042452, 392275, 392275, 230075, 168897, 193530, 125737, +65536, 512, 32, 354062, 12946033, 395571, 395571, 217746, 167822, 180876, 122482, +65536, 512, 32, 325534, 12995512, 393016, 393016, 217836, 166015, 192103, 127943, +65536, 512, 32, 330828, 12810238, 380735, 380735, 215078, 166741, 185818, 123154, +65536, 1024, 32, 691801, 23023964, 1168448, 1168448, 757781, 564179, 504193, 429956, 378103, +65536, 1024, 32, 594255, 22950163, 1191649, 1191649, 724911, 559997, 475463, 426109, 366075, +65536, 1024, 32, 560473, 23056254, 1165800, 1165800, 748687, 551732, 495709, 436352, 365809, +65536, 1024, 32, 705054, 22883542, 1167055, 1167055, 739364, 554780, 479892, 414908, 367170, +65536, 1024, 32, 515151, 22886973, 1164657, 1164657, 729081, 573600, 506863, 426831, 371228, +65536, 2048, 32, 1285143, 42895123, 3650818, 3650818, 2678946, 1860502, 1701988, 1848153, 1356416, 1251704, +65536, 2048, 32, 1291752, 43150664, 3545933, 3545933, 2716595, 1857091, 1811552, 1708114, 1342937, 1265708, +65536, 2048, 32, 1240352, 42766420, 3505626, 3505626, 2728765, 1896689, 1824288, 1799925, 1336233, 1134526, +65536, 2048, 32, 1371493, 43035088, 3671161, 3671161, 2757613, 1964370, 1799177, 1778499, 1419123, 
1111825, +65536, 2048, 32, 1341854, 42933052, 3589252, 3589252, 2791142, 1963518, 1895036, 1940490, 1291235, 1139370, + +WIDE QRCP: m n GEQP3 LUQR CQRRPT + +TSQR: m n GEQRF GEQR CHOLQR CHOLQR_ORHR + +APPLY Q: m n nb_strart ORMQR GEMM GEMQRT(varying NB) diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_8192_col_start_32_col_stop_2048.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_8192_col_start_32_col_stop_2048.dat new file mode 100644 index 00000000..3f25724d --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_subroutines_speed_comp_8192_col_start_32_col_stop_2048.dat @@ -0,0 +1,111 @@ +8192, 32, 32407, 10683, 33294, +8192, 32, 5697, 7955, 36085, +8192, 32, 6197, 7873, 33466, +8192, 32, 5552, 7785, 33106, +8192, 32, 5533, 7591, 36865, +8192, 64, 27615, 10016, 37060, +8192, 64, 21356, 9324, 37062, +8192, 64, 21469, 9206, 35210, +8192, 64, 21665, 10018, 35580, +8192, 64, 20886, 10161, 35272, +8192, 128, 52229, 17613, 44097, +8192, 128, 52974, 17246, 44871, +8192, 128, 57749, 17741, 44111, +8192, 128, 52482, 29869, 47312, +8192, 128, 53920, 16500, 43877, +8192, 256, 212039, 28544, 78289, +8192, 256, 201365, 26907, 80326, +8192, 256, 204241, 26628, 77994, +8192, 256, 199584, 25746, 76729, +8192, 256, 204190, 28742, 77958, +8192, 512, 815089, 55474, 176019, +8192, 512, 819857, 69512, 177422, +8192, 512, 874083, 55260, 170725, +8192, 512, 834745, 66704, 160104, +8192, 512, 845640, 62497, 165062, +8192, 1024, 2901473, 327416, 460466, +8192, 1024, 2943813, 330958, 451286, +8192, 1024, 2907110, 334054, 464284, +8192, 1024, 2887130, 330376, 458126, +8192, 1024, 2919197, 333391, 466229, +8192, 2048, 8466334, 1105594, 2173208, +8192, 2048, 8236924, 1115844, 2133972, +8192, 2048, 8275097, 1104650, 2176848, +8192, 2048, 8270712, 1108152, 2141078, +8192, 2048, 8498177, 1111993, 2123349, +8192, 32, 1745, 4097, 954, 1704, +8192, 32, 1469, 3601, 988, 1604, +8192, 32, 2632, 3461, 1041, 1656, +8192, 32, 
1409, 3590, 920, 1536, +8192, 32, 2559, 3556, 954, 1625, +8192, 64, 4307, 4994, 2172, 6460, +8192, 64, 3504, 4835, 2294, 6571, +8192, 64, 4766, 4943, 2266, 6541, +8192, 64, 3383, 4989, 2430, 6783, +8192, 64, 3267, 4845, 2274, 6488, +8192, 128, 8743, 11885, 5963, 18512, +8192, 128, 8104, 11933, 6212, 14135, +8192, 128, 7142, 11517, 7552, 17470, +8192, 128, 8219, 11653, 6745, 16037, +8192, 128, 8137, 11900, 7155, 16612, +8192, 256, 23513, 17527, 28330, 45254, +8192, 256, 17698, 21065, 23099, 39522, +8192, 256, 22048, 21762, 22801, 38963, +8192, 256, 24888, 18114, 27375, 43404, +8192, 256, 19386, 24658, 24425, 40971, +8192, 512, 46062, 43444, 80709, 116337, +8192, 512, 50130, 43184, 78347, 116388, +8192, 512, 45604, 48374, 78358, 114420, +8192, 512, 47494, 44404, 75742, 112670, +8192, 512, 43844, 45491, 77567, 113985, +8192, 1024, 238570, 239842, 182115, 319906, +8192, 1024, 243322, 221728, 181556, 313921, +8192, 1024, 243513, 256756, 180612, 315176, +8192, 1024, 232688, 222820, 181531, 313016, +8192, 1024, 240627, 227800, 181535, 314162, +8192, 2048, 370199, 367709, 512009, 842443, +8192, 2048, 369881, 377572, 501064, 836163, +8192, 2048, 373442, 373333, 507494, 846605, +8192, 2048, 397937, 371370, 489945, 822176, +8192, 2048, 377980, 373812, 502332, 837401, +8192, 32, 32, 2744, 1758017, 2600, 2600, +8192, 32, 32, 653, 1712013, 2963, 2963, +8192, 32, 32, 746, 1690902, 3101, 3101, +8192, 32, 32, 630, 1685158, 3009, 3009, +8192, 32, 32, 710, 1690525, 2987, 2987, +8192, 64, 32, 1566, 3327563, 12980, 12980, 9646, +8192, 64, 32, 1995, 3296941, 13026, 13026, 13189, +8192, 64, 32, 1287, 3804798, 12574, 12574, 7449, +8192, 64, 32, 1299, 3520011, 12469, 12469, 12640, +8192, 64, 32, 1315, 3521886, 12516, 12516, 12535, +8192, 128, 32, 5995, 6563314, 17518, 17518, 23566, 12567, +8192, 128, 32, 4801, 6561107, 16490, 16490, 23499, 13352, +8192, 128, 32, 4932, 6586709, 16633, 16633, 26919, 13438, +8192, 128, 32, 4562, 6559409, 16288, 16288, 29021, 13429, +8192, 128, 32, 4704, 
6585129, 16220, 16220, 23835, 14099, +8192, 256, 32, 28205, 475761, 39185, 39185, 24707, 41237, 25105, +8192, 256, 32, 24589, 472288, 37917, 37917, 24760, 42803, 25868, +8192, 256, 32, 26359, 479038, 37429, 37429, 23773, 40161, 25260, +8192, 256, 32, 25939, 479660, 37792, 37792, 24695, 43730, 25542, +8192, 256, 32, 22094, 478745, 37800, 37800, 24925, 41620, 28780, +8192, 512, 32, 37854, 879152, 98995, 98995, 73892, 61079, 102890, 59675, +8192, 512, 32, 42128, 869916, 99477, 99477, 73852, 58828, 92754, 59913, +8192, 512, 32, 39693, 873882, 100814, 100814, 72742, 62271, 92258, 60424, +8192, 512, 32, 39056, 860355, 99196, 99196, 74364, 57345, 93411, 61512, +8192, 512, 32, 38978, 887483, 98096, 98096, 75081, 57649, 96153, 60681, +8192, 1024, 32, 93518, 1626486, 391891, 391891, 278385, 237822, 230083, 262406, 273168, +8192, 1024, 32, 92414, 1616219, 393930, 393930, 272084, 233380, 227259, 258563, 265863, +8192, 1024, 32, 92831, 1645679, 385690, 385690, 272941, 235376, 227672, 253095, 266542, +8192, 1024, 32, 93800, 1608071, 386076, 386076, 277353, 229005, 226186, 263379, 263027, +8192, 1024, 32, 93342, 1629724, 386139, 386139, 276227, 232476, 225102, 255132, 263916, +8192, 2048, 32, 258725, 3080670, 1329654, 1329654, 1044336, 896500, 831824, 744832, 858017, 747190, +8192, 2048, 32, 261624, 3039765, 1338985, 1338985, 1028005, 899008, 841485, 748617, 864268, 759886, +8192, 2048, 32, 258872, 3007479, 1331975, 1331975, 1019506, 901131, 831358, 749817, 849672, 733717, +8192, 2048, 32, 227416, 3008225, 1348077, 1348077, 1069693, 902078, 870383, 753381, 850322, 725845, +8192, 2048, 32, 226250, 3046555, 1349231, 1349231, 1042068, 902984, 853493, 753541, 849984, 731981, + +WIDE QRCP: m n GEQP3 LUQR CQRRPT + +TSQR: m n GEQRF GEQR CHOLQR CHOLQR_ORHR + +APPLY Q: m n nb_strart ORMQR GEMM GEMQRT(varying NB) diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_16384_cols_16384_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat 
b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_16384_cols_16384_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..1afb2c3a --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_16384_cols_16384_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,12 @@ +21739268, 39075725, 44315678, 38487385, 38500717, 6313810, 589469252, +19865689, 35347118, 44527683, 38235625, 37909903, 6201376, 578035000, +20291659, 36278762, 43785323, 38880379, 38638384, 6193121, 591134724, +16430007, 55583595, 86243851, 66361750, 66184680, 6190757, 0, +16997228, 53984913, 85163291, 67835775, 66096942, 6268912, 0, +16821623, 57249674, 85851349, 65254374, 67195646, 6241533, 0, +17540230, 69660265, 149879422, 107227590, 104234098, 6342558, 0, +16989187, 71390556, 156292992, 103947364, 104211476, 6262272, 0, +17071797, 71665222, 152760383, 106578385, 100330166, 6222743, 0, +25235040, 93189073, 264708165, 163003545, 162339940, 6253983, 0, +24219878, 93539388, 271390898, 163195384, 156916982, 6283212, 0, +24365102, 93112050, 269204523, 162946863, 159375951, 6321401, 0, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_32768_cols_32768_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_32768_cols_32768_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..b1a128bd --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_32768_cols_32768_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,12 @@ +90218656, 200398800, 253532615, 208540302, 203035850, 32308482, 4179688714, +89446230, 198880707, 252551331, 209832187, 206725030, 32178278, 4123476901, +102574496, 193055224, 256484456, 211485581, 208307697, 32474422, 4161457215, +73603620, 284934503, 418561898, 311144704, 316475941, 32088152, 0, +74718986, 276479033, 423122959, 311796860, 311134065, 
32536586, 0, +72818229, 270781850, 418714700, 303413263, 312954691, 32292249, 0, +68247912, 301743410, 635521092, 435162622, 432222845, 32729685, 0, +68211207, 300819816, 653773867, 432497687, 428933069, 32672010, 0, +69135030, 311249490, 640982177, 426603842, 439539651, 32380921, 0, +75391222, 381219204, 1094440725, 661979854, 655006761, 32441029, 0, +74092221, 371148262, 1089890414, 666624680, 653695095, 32581980, 0, +75408999, 372041024, 1101543795, 664009882, 653765109, 32716286, 0, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_65536_cols_65536_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_65536_cols_65536_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..75b3f21c --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_65536_cols_65536_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,12 @@ +623679661, 1104822463, 1293651681, 1059481168, 1062191763, 272255871, 32434424510, +611403265, 1075820090, 1298711591, 1060976729, 1061665403, 255639492, 32732830563, +622215233, 1070412570, 1294144599, 1063284464, 1035541300, 258612138, 31961435200, +477277020, 1422292697, 1872350957, 1482407994, 1489505870, 255877514, 0, +474540646, 1385907297, 1883830187, 1485917736, 1443090197, 254876553, 0, +479038064, 1363865591, 1910488376, 1496203146, 1466648481, 267668055, 0, +416416563, 1416207914, 2891509395, 2015779545, 2014884865, 258384095, 0, +412909215, 1419072578, 2898797501, 2075792047, 2065084950, 261002907, 0, +414639712, 1433343605, 2918226873, 1974094912, 2067823475, 267134979, 0, +390433703, 1644769316, 4582056687, 2819234391, 2805286785, 259214795, 0, +391730985, 1655363424, 4572740502, 2847843810, 2827732997, 259987471, 0, +387171870, 1650356507, 4563310685, 2826877557, 2775531832, 265641750, 0, diff --git 
a/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_8192_cols_8192_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_8192_cols_8192_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..28462682 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/ICQRRP_time_raw_rows_8192_cols_8192_b_sz_start_256_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,12 @@ +4856115, 7510378, 9611800, 8490797, 8490107, 1784414, 89372043, +4741415, 8108813, 9689149, 8512208, 8506117, 1608479, 90204735, +5045912, 8764159, 9645436, 8535477, 8459878, 1647547, 90106247, +4486255, 10154821, 12798075, 10340475, 10482902, 1656346, 0, +4535183, 9847762, 12799234, 10307665, 10555873, 1657402, 0, +4537126, 10154288, 12926744, 10295948, 10474654, 1716051, 0, +5639506, 15178530, 32216184, 22885440, 23450316, 1716554, 0, +5822033, 15344994, 31058637, 22746608, 22335482, 1716329, 0, +5786699, 15367519, 31405730, 22395525, 22196706, 1744703, 0, +7749283, 24388377, 60574350, 37168964, 38094493, 1750234, 0, +7707568, 24548764, 59819565, 37806951, 38306474, 1769761, 0, +7641212, 25118370, 60965609, 38258742, 38176318, 1690542, 0, diff --git a/benchmark/bench_CQRRP/results/EPYC-7513x2/QR_speed_comp_131072_col_start_256_col_stop_8192.dat b/benchmark/bench_CQRRP/results/EPYC-7513x2/QR_speed_comp_131072_col_start_256_col_stop_8192.dat new file mode 100644 index 00000000..8897c69d --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-7513x2/QR_speed_comp_131072_col_start_256_col_stop_8192.dat @@ -0,0 +1,30 @@ +256, 529467, 81770, 318905, 843361, 880189, +256, 221763, 66876, 347507, 830367, 875768, +256, 223577, 109785, 325558, 895900, 931257, +256, 212390, 85743, 270254, 930172, 968061, +256, 208672, 78136, 265848, 881599, 917792, +512, 441817, 346304, 1835037, 1064221, 1149448, +512, 404841, 339567, 1852182, 976922, 1240616, +512, 419142, 326450, 1876501, 1101697, 1197326, +512, 389202, 
323853, 2027841, 1034511, 1250158, +512, 406614, 366838, 1919158, 1112116, 1218481, +1024, 1440281, 1051430, 3467033, 2013244, 2410552, +1024, 1451417, 1087165, 3137875, 1891517, 2237276, +1024, 1515797, 1002739, 2950582, 1895945, 2142502, +1024, 1396712, 985555, 3070136, 1907571, 2381906, +1024, 1428041, 1623414, 3293030, 1623225, 2154188, +2048, 3228679, 3563346, 7434178, 2097763, 3151161, +2048, 3184745, 2933530, 7191365, 2206000, 3642591, +2048, 3089783, 3267773, 6958205, 1937101, 3209200, +2048, 3183402, 4225769, 7177768, 2128666, 3380910, +2048, 3252325, 3771920, 8083886, 2155027, 3413730, +4096, 8400114, 8452365, 18504519, 6329750, 9363188, +4096, 8203725, 7679000, 18522580, 6389455, 9310929, +4096, 7998636, 8506856, 18500688, 6231988, 9231147, +4096, 8141118, 8110544, 17743669, 5635213, 8616910, +4096, 7666248, 7376722, 16412467, 5758447, 8727233, +8192, 19873545, 21357363, 53688498, 19960419, 30623327, +8192, 21576696, 21471359, 55824675, 18641397, 29022101, +8192, 20873289, 20702280, 50969446, 17056785, 26740990, +8192, 21024149, 20946465, 53168917, 18406034, 28796813, +8192, 23134119, 21235156, 55389770, 17166032, 27054054, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat new file mode 100644 index 00000000..6975e7df --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_16384_cols_16384_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat @@ -0,0 +1,9 @@ +201688, 83, 1259464, 4421921, 1418180, 367133, 10920469, 128621, 304223, 0, 1420622, 19024224, +122201, 65, 1259793, 4209007, 1426382, 336833, 11691227, 125372, 305334, 0, 1433323, 19483155, +123998, 76, 1256473, 4432946, 1419539, 335091, 10883217, 120611, 304764, 0, 1426637, 18883813, +245722, 142, 1475021, 2489909, 
1659500, 716228, 10063476, 228051, 295942, 0, 1660727, 17175218, +246618, 140, 1479562, 2509884, 1649924, 680175, 9583284, 222032, 300608, 0, 1651148, 16673451, +245329, 149, 1496229, 2480589, 1651635, 713226, 9451299, 213427, 294459, 0, 1659512, 16554219, +476667, 469, 2436203, 1816659, 2964956, 1124909, 9242259, 403755, 326052, 0, 2967045, 18794018, +472315, 459, 2460263, 1805487, 3043596, 1122580, 9070470, 407254, 320209, 0, 3045666, 18704703, +470091, 465, 2425951, 1828156, 2694464, 1132293, 8809921, 378641, 324452, 0, 2696425, 18066395, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat new file mode 100644 index 00000000..e9463b32 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_32768_cols_32768_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat @@ -0,0 +1,9 @@ +602418, 122, 3962155, 20649514, 1199858, 582643, 41289913, 370471, 1177302, 0, 1208512, 69843050, +491871, 87, 3974463, 20191324, 1161056, 504165, 40821920, 384110, 1178808, 0, 1179468, 68726216, +495444, 91, 3952153, 20605747, 1175133, 516828, 41831629, 381111, 1181673, 0, 1184206, 70148882, +958560, 164, 3994199, 11199392, 1431029, 809039, 39290835, 657328, 1158788, 0, 1435628, 59503933, +973172, 177, 3948737, 10755991, 1544047, 827175, 39304292, 660376, 1161251, 0, 1548768, 59179939, +971226, 163, 3955421, 10363169, 1398234, 794894, 39121169, 639858, 1161484, 0, 1402856, 58410240, +1906402, 309, 5344295, 6396423, 2067749, 1227322, 38320077, 1221419, 1145594, 0, 2070159, 57632000, +1895664, 113, 5335102, 6929200, 2302263, 1313726, 38623053, 1230325, 1164100, 0, 2304647, 58795930, +1905037, 108, 5359991, 6868871, 2413137, 1395790, 38985329, 1260258, 1162638, 0, 2415515, 59353537, diff --git 
a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_4000_cols_4000_b_sz_start_32_b_sz_end_1024_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_4000_cols_4000_b_sz_start_32_b_sz_end_1024_d_factor_1.000000.dat new file mode 100644 index 00000000..2b4736d5 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_4000_cols_4000_b_sz_start_32_b_sz_end_1024_d_factor_1.000000.dat @@ -0,0 +1,18 @@ +27824, 49, 148789, 481492, 12897, 31055, 12149490, 14545, 34503, 0, 15276, 12903023, +2033, 127, 144379, 478736, 12892, 31111, 12151045, 14377, 34373, 0, 14491, 12870672, +2119, 96, 144259, 482660, 12815, 31904, 12336536, 14436, 34344, 0, 14400, 13060754, +3567, 121, 130924, 309739, 15286, 36089, 11946349, 12595, 25789, 0, 16109, 12481282, +2908, 426, 129068, 310320, 15207, 37167, 11941628, 12509, 25896, 0, 16019, 12475941, +3234, 201, 128864, 309256, 15180, 37886, 11943817, 12551, 25759, 0, 16000, 12477568, +5705, 505, 142067, 359159, 18853, 27410, 137970, 10380, 23081, 0, 19372, 725649, +6152, 817, 140179, 356708, 19251, 27572, 139468, 10401, 23037, 0, 19730, 724064, +6356, 331, 140578, 357115, 19294, 27699, 138260, 10405, 23093, 0, 19770, 723607, +11947, 105, 150878, 230508, 19411, 34720, 93614, 17863, 21820, 0, 19670, 581125, +10234, 931, 149803, 226808, 20291, 35504, 123134, 16146, 19746, 0, 20545, 602851, +10648, 456, 148705, 231903, 18881, 34836, 97166, 16114, 19976, 0, 19118, 578922, +20925, 63, 170236, 139415, 22281, 45561, 85754, 29914, 21474, 0, 22401, 535743, +20143, 1467, 168452, 133901, 22492, 45721, 85524, 27123, 22877, 0, 22626, 527834, +19710, 2574, 168197, 134043, 21876, 45382, 84881, 27493, 22600, 0, 22020, 526900, +38745, 2596, 151060, 78142, 41793, 60875, 91307, 40034, 28586, 0, 41864, 533209, +39180, 5637, 146814, 74080, 42622, 62772, 87972, 40415, 29199, 0, 42689, 528758, +39693, 3024, 149112, 71544, 42602, 62471, 89174, 41901, 29492, 0, 42666, 
529077, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_8196_cols_8196_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_8196_cols_8196_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat new file mode 100644 index 00000000..c48c1ff2 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/CQRRP_runtime_breakdown_8196_cols_8196_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat @@ -0,0 +1,9 @@ +82753, 85, 382704, 912730, 85397, 86885, 999521, 48046, 81808, 0, 86190, 2680722, +44254, 353, 384872, 940652, 70836, 66029, 720044, 40770, 82072, 0, 71629, 2350675, +34111, 946, 388579, 913667, 74408, 67469, 740211, 41266, 81508, 0, 75177, 2342934, +65259, 544, 441908, 545607, 109098, 114472, 793850, 80389, 78783, 0, 109516, 2230328, +66409, 989, 447421, 548830, 116021, 118396, 934267, 83636, 79554, 0, 116432, 2395934, +65524, 793, 444876, 548077, 111189, 117033, 894896, 80481, 80639, 0, 111669, 2343988, +170888, 470, 835765, 370532, 170067, 159694, 730934, 136143, 88903, 0, 170302, 2663631, +134867, 441, 832370, 366300, 164259, 149232, 618331, 133548, 87475, 0, 164481, 2487045, +195577, 434, 825248, 372203, 165905, 154949, 705085, 139962, 90788, 0, 166131, 2650377, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_16384_cols_16384_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_16384_cols_16384_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat new file mode 100644 index 00000000..32d9d60d --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_16384_cols_16384_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat @@ -0,0 +1,9 @@ +19675308, 29772146, 36683101, 31219481, 31102344, 8281077, 297938852, +18606036, 29816923, 35904781, 31423594, 31661580, 8187622, 298768446, +19406728, 29766795, 36309703, 
31510332, 30549713, 8207170, 298162877, +16712495, 39884608, 63408116, 50110467, 48792115, 8156101, 0, +16449505, 39272000, 63480390, 49856208, 49173187, 8269157, 0, +17311482, 40095195, 63729402, 49693576, 48595628, 8276746, 0, +17265048, 46743534, 113436561, 79889337, 77694113, 8232368, 0, +17410039, 46632956, 114013085, 79767611, 77479627, 8227153, 0, +17201588, 46721596, 113959292, 80526141, 77883042, 8209893, 0, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_32768_cols_32768_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_32768_cols_32768_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat new file mode 100644 index 00000000..d85dc6fe --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_32768_cols_32768_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat @@ -0,0 +1,9 @@ +71320408, 131745337, 168419954, 138309285, 135113217, 38458760, 2247894747, +70133172, 130596048, 168990444, 137080795, 137112326, 38067245, 2253206233, +68953490, 128535293, 169075124, 136975985, 136134926, 38059215, 2242733064, +58764240, 161977632, 283684682, 212089050, 209920865, 38108852, 0, +59537558, 162756655, 284514896, 211767196, 209995660, 38146639, 0, +59478775, 162728420, 284640379, 212020954, 208975802, 38156142, 0, +59078846, 186018027, 474935727, 324052857, 319686969, 38248656, 0, +58882542, 186127564, 473272398, 323763916, 318800258, 38075984, 0, +58479813, 185552444, 474928987, 325182336, 319399943, 38277470, 0, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_4000_cols_4000_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_4000_cols_4000_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..77b88861 --- /dev/null +++ 
b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_4000_cols_4000_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,21 @@ +13025243, 12900082, 659262, 650707, 646362, 157764, 0, +12909435, 12898533, 657153, 652589, 648104, 126642, 0, +13100144, 13080715, 658359, 653080, 648969, 128494, 0, +12483856, 12534893, 649893, 621169, 604693, 128010, 0, +12668485, 12725841, 654039, 622412, 608022, 127467, 0, +12693511, 12756063, 653953, 624062, 608048, 128285, 0, +719229, 862250, 806682, 702271, 660784, 127056, 0, +718828, 829961, 805432, 702513, 662524, 129134, 0, +716255, 858016, 805365, 704337, 663945, 130064, 0, +644363, 979574, 1197998, 891264, 862379, 128977, 4782159, +638508, 985731, 1186940, 890751, 861523, 129703, 4771487, +638157, 981336, 1194285, 890186, 857752, 128064, 4764594, +525661, 1385470, 1839205, 1255791, 1235997, 128661, 0, +529687, 1407230, 1863870, 1277017, 1257199, 134553, 0, +528268, 1407314, 1867016, 1278940, 1253062, 127730, 0, +540309, 1807884, 3309211, 1976838, 1952796, 130813, 0, +531726, 1750141, 3203966, 1984865, 1954259, 133085, 0, +531914, 1764606, 3234524, 1992119, 1959467, 125988, 0, +665480, 2841825, 8779902, 4767754, 4588614, 126104, 0, +668807, 2836918, 8835286, 4591099, 4598596, 126478, 0, +683940, 2844920, 8845712, 4674864, 4614201, 130460, 0, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_8192_cols_8192_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_8192_cols_8192_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat new file mode 100644 index 00000000..6a79a2d4 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP32/ICQRRP_time_raw_rows_8192_cols_8192_b_sz_start_256_b_sz_end_1024_d_factor_1.000000.dat @@ -0,0 +1,9 @@ +2941935, 5242415, 7207858, 5812057, 5531308, 819775, 39639856, +2762252, 5381311, 7136696, 5845655, 5763076, 776517, 39738960, +2931735, 5308159, 7143301, 
5588022, 5508464, 775250, 39628228, +2388687, 6919830, 10181918, 7452682, 7488895, 780434, 0, +2396178, 6935512, 10186870, 7404785, 7233516, 784822, 0, +2299669, 7037769, 10098049, 7484905, 7377435, 782462, 0, +3082133, 9012383, 22400772, 14954903, 14662409, 781552, 0, +3029921, 9247802, 22502933, 15080276, 14781902, 797108, 0, +2922449, 8956852, 22590148, 14733204, 14755865, 779475, 0, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP64/CQRRP_runtime_breakdown_4000_cols_4000_b_sz_start_32_b_sz_end_1024_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP64/CQRRP_runtime_breakdown_4000_cols_4000_b_sz_start_32_b_sz_end_1024_d_factor_1.000000.dat new file mode 100644 index 00000000..50295390 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP64/CQRRP_runtime_breakdown_4000_cols_4000_b_sz_start_32_b_sz_end_1024_d_factor_1.000000.dat @@ -0,0 +1,18 @@ +26526, 49, 755034, 558796, 14204, 32622, 12174582, 14450, 34779, 0, 17002, 13613840, +35041, 103, 789824, 559579, 14741, 31283, 12254801, 14410, 34756, 0, 16452, 13736249, +30720, 67, 725273, 511426, 14348, 29294, 12340551, 14235, 34528, 0, 16051, 13702145, +5519, 20, 408803, 309001, 16948, 39153, 12050602, 13599, 25994, 0, 17809, 12870500, +36162, 246, 353945, 315640, 16798, 37639, 11916594, 13698, 26257, 0, 17667, 12717848, +4719, 101, 539155, 328235, 16958, 36301, 12014431, 13689, 26150, 0, 17875, 12980656, +9183, 44, 290875, 405177, 19426, 27584, 140953, 13215, 22688, 0, 19915, 929634, +5541, 544, 470503, 350313, 18245, 27399, 128550, 10591, 22613, 0, 18782, 1034836, +8101, 257, 356814, 380116, 19027, 27584, 124808, 10381, 22897, 0, 19495, 950453, +11185, 60, 288945, 220007, 20536, 36260, 118052, 18723, 20475, 0, 20778, 734485, +42066, 516, 275781, 222946, 20561, 36185, 119315, 18175, 19912, 0, 20802, 755698, +42899, 170, 231344, 204949, 20367, 35418, 121062, 18651, 19538, 0, 20601, 694632, +19900, 84, 188029, 127250, 23235, 45358, 85824, 30381, 22235, 0, 23363, 542424, 
+18991, 1373, 191544, 124066, 22583, 34333, 85814, 28904, 22696, 0, 22706, 530427, +19161, 1151, 188133, 124924, 22160, 34032, 85723, 29375, 22917, 0, 22294, 527710, +82333, 78, 191094, 67759, 48358, 57833, 115218, 47036, 29766, 0, 48420, 639537, +36335, 6109, 176551, 70953, 46218, 63747, 92887, 44823, 28967, 0, 46285, 566657, +44899, 129, 181633, 70865, 48424, 65479, 106457, 48795, 28105, 0, 48488, 594850, diff --git a/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP64/ICQRRP_time_raw_rows_4000_cols_4000_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP64/ICQRRP_time_raw_rows_4000_cols_4000_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat new file mode 100644 index 00000000..e0100076 --- /dev/null +++ b/benchmark/bench_CQRRP/results/EPYC-9354P/OpenMP64/ICQRRP_time_raw_rows_4000_cols_4000_b_sz_start_32_b_sz_end_2048_d_factor_1.000000.dat @@ -0,0 +1,21 @@ +13891087, 12919149, 675350, 628461, 618663, 157070, 0, +13562799, 13114035, 716220, 633067, 623477, 127980, 0, +13639920, 12835648, 634867, 630610, 621056, 124904, 0, +12658552, 12458819, 635324, 603558, 585480, 125961, 0, +12794586, 12610805, 636795, 605650, 585899, 129501, 0, +12679075, 12625953, 641049, 609046, 589853, 126120, 0, +1001586, 848263, 817513, 699374, 660841, 125860, 0, +995288, 883210, 817654, 706053, 667555, 124812, 0, +973389, 894161, 811829, 705799, 667780, 128634, 0, +746284, 1004419, 1185707, 890584, 857103, 127887, 4794896, +756807, 1028640, 1156555, 888655, 859022, 126303, 4763966, +741086, 1041106, 1157784, 886451, 855941, 125539, 4759208, +587143, 1423013, 1911274, 1312502, 1272291, 129233, 0, +543969, 1433830, 1901284, 1299011, 1257340, 129570, 0, +576741, 1431168, 1941958, 1292959, 1264745, 128762, 0, +542806, 1832292, 3399586, 2046122, 2022539, 128319, 0, +633910, 1768705, 3306211, 2097705, 2043625, 132425, 0, +577962, 1763561, 3329009, 2063254, 2038885, 128512, 0, +679388, 2855181, 8782200, 4665535, 4617611, 128929, 0, +737919, 2896448, 
8816294, 4736367, 4605464, 132642, 0, +694165, 2844358, 8835730, 4682825, 4630735, 128296, 0, diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_16384_cols_16384_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_16384_cols_16384_d_factor_1.0.dat new file mode 100644 index 00000000..39da765f --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_16384_cols_16384_d_factor_1.0.dat @@ -0,0 +1,28 @@ +16384, 32, 3874, 2.81606e+06, 701, 7427, 514, 1.42438e+06, 2, 64708, 196897, 795300, 0, 4655, 6871, 14726, 11113, +16384, 64, 3851, 1.48309e+06, 359, 5087, 258, 714654, 0, 52223, 266147, 428791, 0, 2325, 5229, 12284, 5954, +16384, 96, 3309, 1.05795e+06, 239, 4273, 171, 477332, 0, 47651, 325699, 332916, 0, 1566, 4939, 12964, 5925, +16384, 128, 3371, 842026, 185, 3824, 128, 358409, 2, 45836, 385389, 279682, 0, 1194, 5296, 12520, 4500, +16384, 160, 3428, 922846, 156, 3673, 104, 287501, 0, 44360, 445170, 371490, 0, 1006, 5589, 13857, 5200, +16384, 192, 3377, 832509, 128, 3554, 97, 239753, 0, 43348, 496618, 316792, 0, 828, 4906, 13327, 4456, +16384, 224, 3389, 769334, 109, 3476, 81, 206525, 0, 43172, 558788, 290050, 0, 719, 4894, 13787, 4480, +16384, 256, 3388, 723629, 100, 3265, 72, 181198, 0, 43714, 631561, 268758, 0, 621, 5096, 13528, 4033, +16384, 288, 3431, 581791, 88, 3386, 60, 161442, 0, 44009, 702162, 323867, 0, 591, 5391, 14569, 3870, +16384, 320, 3414, 566363, 82, 3324, 56, 145847, 0, 44309, 785832, 296541, 0, 541, 5008, 14398, 3565, +16384, 352, 3417, 550270, 76, 3236, 54, 132494, 1, 44838, 867273, 280260, 0, 485, 4948, 14706, 3644, +16384, 384, 3432, 539258, 59, 3230, 49, 121447, 0, 45866, 970138, 267319, 0, 445, 5084, 14520, 3374, +16384, 416, 3453, 457864, 58, 3281, 47, 112829, 0, 46485, 1.08912e+06, 305078, 0, 415, 5331, 15455, 3474, +16384, 448, 3443, 456473, 56, 3267, 43, 104977, 0, 46840, 1.2008e+06, 286867, 0, 394, 5066, 
15090, 3240, +16384, 480, 3439, 456111, 56, 3194, 40, 98417, 0, 46964, 1.3165e+06, 276378, 0, 360, 5015, 15722, 3345, +16384, 512, 3503, 451058, 56, 3131, 38, 92571, 0, 46550, 1.43752e+06, 265752, 0, 341, 5123, 15142, 3122, +16384, 640, 3540, 404479, 45, 3136, 30, 74386, 0, 52107, 1.91013e+06, 263700, 0, 295, 5208, 15969, 2921, +16384, 768, 3604, 375343, 35, 3195, 26, 62614, 0, 53861, 2.41925e+06, 263046, 0, 251, 5339, 17009, 2822, +16384, 896, 3633, 351423, 34, 3164, 26, 54055, 0, 55962, 2.92923e+06, 261541, 2, 222, 5577, 18529, 2692, +16384, 1024, 3675, 335494, 29, 3173, 22, 47440, 0, 56226, 3.45338e+06, 257232, 0, 192, 5693, 19563, 2525, +16384, 1152, 3743, 330844, 31, 3250, 22, 42767, 0, 59639, 3.95658e+06, 257109, 0, 178, 5942, 21227, 2527, +16384, 1280, 3728, 321642, 25, 3243, 22, 38772, 0, 60482, 4.48173e+06, 254971, 0, 161, 6196, 22653, 2400, +16384, 1408, 3788, 311028, 23, 3230, 22, 35354, 0, 62836, 4.99148e+06, 253667, 0, 155, 6417, 23681, 2421, +16384, 1536, 3821, 309515, 21, 3269, 19, 32634, 0, 63059, 5.49967e+06, 252429, 0, 141, 6559, 24264, 2326, +16384, 1664, 3876, 308842, 21, 3371, 18, 30283, 0, 67860, 6.05936e+06, 250756, 0, 136, 6922, 26934, 2331, +16384, 1792, 3920, 299093, 21, 3243, 18, 28330, 0, 68619, 6.58756e+06, 252564, 0, 129, 6980, 27584, 2280, +16384, 1920, 3982, 300694, 20, 3368, 16, 26781, 0, 68890, 7.04412e+06, 246158, 0, 116, 7457, 30075, 2230, +16384, 2048, 4054, 295616, 19, 3298, 15, 25032, 0, 70362, 7.68078e+06, 245372, 0, 112, 7859, 29412, 2168, diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_32768_cols_32768_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_32768_cols_32768_d_factor_1.0.dat new file mode 100644 index 00000000..a73404fa --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_32768_cols_32768_d_factor_1.0.dat @@ -0,0 +1,28 @@ +32768, 32, 12091, 1.38733e+07, 1911, 21690, 
1027, 1.09823e+07, 103, 209084, 419098, 6.02357e+06, 0, 10560, 14777, 69232, 27544, +32768, 64, 12087, 7.22781e+06, 981, 16198, 512, 5.49344e+06, 55, 159588, 516448, 3.25082e+06, 0, 5289, 10858, 55779, 14498, +32768, 96, 12136, 5.52086e+06, 674, 14678, 344, 3.68883e+06, 46, 123007, 646597, 2.6583e+06, 0, 3625, 10389, 66613, 69912, +32768, 128, 12147, 4.35984e+06, 497, 13682, 258, 2.77863e+06, 20, 122444, 779294, 2.3282e+06, 0, 2796, 11043, 50389, 49163, +32768, 160, 12157, 4.24492e+06, 398, 13493, 252, 2.22189e+06, 25, 129283, 964112, 3.02833e+06, 0, 2393, 12737, 35569, 13014, +32768, 192, 12239, 3.8268e+06, 353, 13471, 220, 1.85618e+06, 16, 128954, 1.20054e+06, 2.59446e+06, 0, 1982, 10974, 34784, 10799, +32768, 224, 12179, 3.53696e+06, 305, 13500, 195, 1.59179e+06, 21, 128422, 1.46573e+06, 2.41977e+06, 0, 1749, 10544, 35350, 10774, +32768, 256, 12339, 3.29582e+06, 253, 12683, 193, 1.39269e+06, 10, 127622, 1.72211e+06, 2.28666e+06, 0, 1563, 10825, 34554, 9537, +32768, 288, 12190, 2.62223e+06, 242, 12951, 136, 1.24266e+06, 20, 130485, 2.02626e+06, 2.69448e+06, 0, 1458, 12314, 41106, 9191, +32768, 320, 12455, 2.54116e+06, 209, 12780, 144, 1.12348e+06, 12, 129970, 2.29509e+06, 2.46547e+06, 0, 1328, 11223, 40472, 8229, +32768, 352, 12358, 2.49298e+06, 207, 12914, 136, 1.02146e+06, 17, 133043, 2.58727e+06, 2.35459e+06, 0, 1219, 10908, 39918, 8586, +32768, 384, 12393, 2.41092e+06, 177, 12802, 125, 935202, 8, 136808, 2.87374e+06, 2.2747e+06, 0, 1128, 11312, 39966, 7834, +32768, 416, 12405, 2.00295e+06, 185, 12878, 110, 867291, 11, 138354, 3.18845e+06, 2.5579e+06, 0, 1092, 12331, 45953, 7971, +32768, 448, 12364, 1.9943e+06, 162, 12930, 109, 804324, 10, 138911, 3.48835e+06, 2.40601e+06, 0, 1000, 11652, 46363, 7338, +32768, 480, 12385, 1.99234e+06, 151, 12591, 99, 754757, 10, 139696, 3.77198e+06, 2.326e+06, 0, 952, 11426, 45317, 7604, +32768, 512, 12532, 1.96915e+06, 144, 12335, 101, 705269, 9, 139939, 4.07307e+06, 2.26778e+06, 0, 869, 11584, 44286, 7029, +32768, 640, 12695, 
1.69387e+06, 113, 12495, 81, 567460, 10, 151393, 5.2254e+06, 2.26494e+06, 0, 740, 12096, 49656, 6381, +32768, 768, 12649, 1.50874e+06, 101, 12467, 72, 474456, 14, 158637, 6.40286e+06, 2.26204e+06, 0, 630, 12268, 53447, 5988, +32768, 896, 12765, 1.38197e+06, 83, 12698, 60, 408121, 8, 166626, 7.59016e+06, 2.26032e+06, 0, 561, 12866, 58842, 5680, +32768, 1024, 12851, 1.28038e+06, 90, 12420, 53, 357640, 5, 172198, 8.79064e+06, 2.2539e+06, 0, 508, 13331, 63307, 5453, +32768, 1152, 12961, 1.20278e+06, 68, 12261, 53, 318991, 7, 178416, 9.91563e+06, 2.24658e+06, 0, 444, 13796, 68087, 5241, +32768, 1280, 13025, 1.14205e+06, 60, 12271, 48, 287502, 3, 180570, 1.10696e+07, 2.24314e+06, 0, 405, 14083, 72575, 5055, +32768, 1408, 13119, 1.09387e+06, 63, 12318, 43, 262848, 4, 191118, 1.22628e+07, 2.23745e+06, 0, 379, 14919, 78016, 5022, +32768, 1536, 13195, 1.05255e+06, 61, 12306, 40, 241483, 3, 192965, 1.34275e+07, 2.23252e+06, 0, 376, 15297, 82061, 4931, +32768, 1664, 13260, 1.0278e+06, 57, 12324, 39, 222991, 3, 206304, 1.45441e+07, 2.22638e+06, 0, 320, 15862, 85423, 4800, +32768, 1792, 13301, 998307, 50, 12314, 36, 207439, 3, 209569, 1.57024e+07, 2.22135e+06, 0, 302, 15907, 90096, 4685, +32768, 1920, 13397, 971225, 52, 12161, 38, 194393, 2, 217073, 1.69279e+07, 2.20556e+06, 0, 306, 17104, 98634, 4681, +32768, 2048, 13554, 958486, 48, 12457, 36, 182590, 1, 217918, 1.81017e+07, 2.19866e+06, 0, 265, 17487, 99593, 4512, diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_4423_cols_4423_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_4423_cols_4423_d_factor_1.0.dat new file mode 100644 index 00000000..7f1490a0 --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_4423_cols_4423_d_factor_1.0.dat @@ -0,0 +1,9 @@ +4423, 32, 962, 180552, 143, 1268, 140, 37258, 0, 14812, 52871, 38684, 0, 1121, 1793, 3734, 2684, +4423, 64, 967, 107750, 71, 760, 
70, 18874, 0, 13523, 72068, 29858, 0, 570, 1340, 3075, 1498, +4423, 96, 816, 85997, 48, 576, 47, 12727, 0, 12514, 84858, 27850, 0, 386, 1256, 3128, 1458, +4423, 128, 842, 76116, 35, 484, 35, 9670, 0, 12657, 104326, 26668, 0, 295, 1346, 3030, 1147, +4423, 160, 909, 79025, 29, 437, 28, 7800, 0, 12223, 120603, 17346, 0, 240, 1398, 3205, 1214, +4423, 192, 1095, 73755, 25, 401, 24, 6562, 0, 12385, 136788, 16048, 0, 206, 1271, 3189, 1090, +4423, 224, 984, 69997, 21, 371, 20, 5640, 0, 11942, 149444, 15189, 0, 169, 1252, 3130, 1074, +4423, 256, 948, 67585, 18, 349, 18, 4995, 0, 12250, 164763, 14904, 0, 154, 1298, 3149, 980, +4423, 288, 1066, 59112, 18, 346, 16, 4496, 1, 12027, 179897, 15936, 0, 137, 1331, 3231, 991, diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_8191_cols_8191_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_8191_cols_8191_d_factor_1.0.dat new file mode 100644 index 00000000..7ffb92fa --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_8191_cols_8191_d_factor_1.0.dat @@ -0,0 +1,16 @@ +8191, 32, 1560, 611256, 263, 2779, 260, 213840, 0, 30730, 96829, 129053, 0, 2157, 3371, 7088, 5144, +8191, 64, 1388, 339578, 133, 1743, 128, 107275, 0, 24700, 125008, 80620, 0, 1072, 2529, 5790, 2752, +8191, 96, 1377, 257136, 96, 1391, 86, 72073, 0, 23589, 159898, 74051, 0, 743, 2432, 6114, 2708, +8191, 128, 1389, 214742, 70, 1213, 64, 54293, 0, 22166, 190915, 65255, 0, 556, 2572, 5878, 2116, +8191, 160, 1426, 233326, 59, 1126, 52, 43734, 0, 22385, 218968, 65499, 0, 457, 2699, 6361, 2323, +8191, 192, 1381, 214915, 46, 1059, 44, 36594, 0, 21966, 250065, 57997, 0, 382, 2410, 6153, 1971, +8191, 224, 1542, 202534, 39, 1014, 38, 31373, 0, 21929, 278958, 56216, 0, 335, 2409, 6383, 2039, +8191, 256, 1465, 191311, 35, 943, 33, 27587, 0, 21485, 305484, 52133, 0, 291, 2457, 6116, 1808, +8191, 288, 1455, 159833, 32, 960, 29, 24669, 0, 
21547, 332860, 59035, 0, 262, 2579, 6517, 1817, +8191, 320, 1661, 157402, 31, 955, 26, 22323, 0, 21858, 368440, 55247, 0, 234, 2445, 6478, 1681, +8191, 352, 1432, 154435, 26, 921, 24, 20476, 0, 22037, 393728, 54132, 0, 214, 2423, 6619, 1739, +8191, 384, 1413, 152304, 24, 914, 22, 18836, 0, 22360, 423278, 52204, 0, 200, 2476, 6545, 1633, +8191, 416, 1631, 133581, 23, 915, 20, 17507, 0, 22205, 457232, 56510, 0, 185, 2592, 6744, 1679, +8191, 448, 1428, 133542, 23, 927, 19, 16332, 0, 22107, 483465, 53712, 0, 172, 2446, 6802, 1587, +8191, 480, 1624, 132466, 23, 886, 18, 15280, 0, 21918, 514337, 52911, 0, 170, 2445, 6946, 1643, +8191, 512, 1692, 132460, 19, 865, 16, 14339, 0, 21941, 551440, 50950, 0, 149, 2501, 6618, 1522, diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_16384_cols_16384_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_16384_cols_16384_d_factor_1.0.dat new file mode 100644 index 00000000..15d1ba7e --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_16384_cols_16384_d_factor_1.0.dat @@ -0,0 +1,28 @@ +16384, 32, 22511, 2.82413e+06, 686, 7302, 512, 1.42447e+06, 3, 274972, 0, 801411, 0, 4571, 6846, 14867, 12229, +16384, 64, 21528, 1.49259e+06, 360, 5043, 256, 716070, 0, 275093, 0, 433728, 0, 2314, 5215, 12252, 7496, +16384, 96, 21525, 1.07054e+06, 234, 4217, 173, 477500, 0, 283359, 0, 337069, 0, 1578, 4934, 12951, 6548, +16384, 128, 21832, 853834, 179, 3801, 128, 358674, 0, 283669, 0, 284133, 0, 1202, 5298, 12519, 4899, +16384, 160, 22109, 934163, 150, 3654, 105, 287737, 0, 287007, 0, 375583, 0, 1028, 5594, 13856, 6204, +16384, 192, 22067, 842246, 129, 3535, 91, 239964, 1, 286481, 0, 320615, 0, 852, 4904, 13301, 5281, +16384, 224, 22193, 780846, 119, 3459, 79, 206836, 1, 289788, 0, 293210, 0, 732, 4854, 13757, 5588, +16384, 256, 21369, 736116, 99, 3269, 74, 181519, 0, 291597, 0, 271982, 0, 667, 5078, 13564, 4562, 
+16384, 288, 21463, 593412, 86, 3340, 62, 161487, 0, 294094, 0, 327190, 0, 616, 5377, 14591, 4540, +16384, 320, 22044, 576905, 78, 3322, 57, 145818, 0, 295942, 0, 299827, 0, 568, 5017, 14426, 4167, +16384, 352, 21560, 564315, 71, 3248, 50, 132607, 0, 297680, 0, 283438, 0, 503, 4909, 14718, 4364, +16384, 384, 22028, 555311, 66, 3241, 49, 121872, 0, 300521, 0, 269865, 0, 477, 5059, 14513, 3795, +16384, 416, 21128, 469471, 63, 3279, 41, 112876, 0, 301868, 0, 308103, 0, 461, 5297, 15387, 4076, +16384, 448, 21964, 465611, 61, 3249, 39, 105031, 0, 303389, 0, 289835, 0, 428, 5031, 15059, 3741, +16384, 480, 21222, 466433, 55, 3201, 38, 98442, 0, 304553, 0, 279534, 0, 411, 5003, 15703, 4028, +16384, 512, 21251, 462642, 52, 3082, 40, 92315, 0, 305170, 0, 268599, 0, 385, 5083, 15127, 3370, +16384, 640, 21275, 419188, 43, 3168, 34, 74586, 1, 310627, 0, 267012, 0, 328, 5197, 16084, 3204, +16384, 768, 21368, 389000, 38, 3199, 28, 62761, 0, 311761, 0, 265935, 0, 285, 5262, 17001, 3058, +16384, 896, 22119, 359547, 34, 3163, 26, 53896, 0, 314161, 0, 264404, 0, 267, 5438, 18056, 3036, +16384, 1024, 22027, 349446, 27, 3173, 23, 47754, 0, 310276, 0, 259917, 0, 253, 5569, 19186, 3032, +16384, 1152, 22334, 342498, 27, 3213, 25, 42696, 0, 310316, 0, 259589, 0, 242, 5951, 21128, 2802, +16384, 1280, 21534, 337948, 23, 3231, 21, 38660, 0, 313206, 0, 257804, 0, 231, 6056, 22285, 2754, +16384, 1408, 21566, 333091, 24, 3296, 18, 35581, 0, 315466, 0, 256054, 0, 222, 6279, 23388, 2832, +16384, 1536, 22421, 325544, 21, 3318, 18, 32943, 0, 315310, 0, 254849, 0, 205, 6511, 24057, 2601, +16384, 1664, 21644, 320889, 22, 3347, 19, 30530, 0, 317137, 0, 252540, 0, 198, 6736, 26455, 2638, +16384, 1792, 22661, 314052, 18, 3290, 17, 28387, 0, 317611, 0, 255221, 0, 198, 7055, 27531, 2582, +16384, 1920, 22149, 312985, 17, 3324, 18, 26704, 0, 316282, 0, 248558, 0, 192, 7268, 29507, 2571, +16384, 2048, 22078, 312047, 16, 3332, 15, 25101, 0, 320374, 0, 247074, 0, 185, 7643, 28389, 2381, diff --git 
a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_32768_cols_32768_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_32768_cols_32768_d_factor_1.0.dat new file mode 100644 index 00000000..ce00e1fd --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_32768_cols_32768_d_factor_1.0.dat @@ -0,0 +1,28 @@ +32768, 32, 29971, 1.38905e+07, 1897, 21580, 1026, 1.10204e+07, 131, 692105, 0, 6.05572e+06, 0, 10726, 14706, 51519, 27533, +32768, 64, 29566, 7.2257e+06, 978, 16086, 512, 5.49152e+06, 72, 696019, 0, 3.25403e+06, 0, 5263, 10843, 43048, 13054, +32768, 96, 30767, 5.28202e+06, 667, 14527, 342, 3.67563e+06, 39, 713283, 0, 2.63718e+06, 0, 3570, 10286, 44395, 41037, +32768, 128, 30806, 4.18919e+06, 481, 13622, 256, 2.7824e+06, 21, 724813, 0, 2.31274e+06, 0, 2782, 10980, 35770, 29410, +32768, 160, 30150, 4.24264e+06, 403, 13389, 241, 2.22238e+06, 23, 743578, 0, 3.03204e+06, 0, 2445, 12693, 35596, 15483, +32768, 192, 30190, 3.83941e+06, 346, 13412, 216, 1.856e+06, 8, 747617, 0, 2.59708e+06, 0, 2046, 10939, 34806, 12970, +32768, 224, 30833, 3.54506e+06, 304, 13406, 198, 1.59204e+06, 17, 752984, 0, 2.42403e+06, 0, 1778, 10587, 35635, 13377, +32768, 256, 30570, 3.31794e+06, 253, 12670, 189, 1.39367e+06, 9, 749365, 0, 2.28624e+06, 0, 1583, 10945, 34879, 10865, +32768, 288, 30197, 2.62634e+06, 243, 12994, 149, 1.24256e+06, 14, 766783, 0, 2.6975e+06, 0, 1533, 12341, 41091, 10824, +32768, 320, 30327, 2.55502e+06, 220, 12890, 139, 1.12164e+06, 13, 761511, 0, 2.4682e+06, 0, 1385, 11240, 40600, 9675, +32768, 352, 30347, 2.5041e+06, 205, 12961, 135, 1.02072e+06, 11, 765876, 0, 2.35601e+06, 0, 1270, 10945, 40002, 10197, +32768, 384, 30265, 2.43613e+06, 177, 12912, 125, 936491, 12, 768543, 0, 2.277e+06, 0, 1201, 11336, 40201, 8740, +32768, 416, 30363, 2.02702e+06, 184, 13019, 120, 868208, 7, 782076, 0, 2.56018e+06, 0, 1151, 12375, 46574, 9173, +32768, 448, 
30281, 2.00744e+06, 166, 12902, 102, 806413, 9, 785913, 0, 2.40813e+06, 0, 1077, 11570, 46024, 8586, +32768, 480, 31007, 2.01252e+06, 150, 12789, 96, 753945, 13, 785663, 0, 2.327e+06, 0, 1026, 11411, 45243, 9038, +32768, 512, 30452, 1.98726e+06, 140, 12344, 97, 706254, 14, 788132, 0, 2.26967e+06, 0, 956, 11629, 44593, 7671, +32768, 640, 30431, 1.71607e+06, 117, 12673, 85, 567153, 7, 791395, 0, 2.26525e+06, 0, 818, 12040, 49464, 7219, +32768, 768, 30527, 1.51851e+06, 103, 12633, 70, 474462, 13, 794291, 0, 2.26808e+06, 0, 727, 12375, 53724, 6700, +32768, 896, 31007, 1.3904e+06, 94, 12456, 63, 407205, 3, 809529, 0, 2.2592e+06, 0, 645, 12797, 58924, 6666, +32768, 1024, 30947, 1.28833e+06, 83, 12141, 59, 357464, 7, 805354, 0, 2.25889e+06, 0, 591, 13493, 64234, 6106, +32768, 1152, 30933, 1.22298e+06, 75, 12234, 55, 318533, 6, 811642, 0, 2.25649e+06, 0, 529, 14011, 68946, 6011, +32768, 1280, 30865, 1.15589e+06, 68, 12348, 48, 287992, 5, 811648, 0, 2.24968e+06, 0, 482, 14077, 73214, 5723, +32768, 1408, 31061, 1.11636e+06, 57, 12322, 44, 263155, 2, 825140, 0, 2.25054e+06, 0, 476, 14935, 78342, 5732, +32768, 1536, 31781, 1.07658e+06, 59, 12321, 38, 241619, 1, 829461, 0, 2.2416e+06, 0, 425, 15320, 83312, 5437, +32768, 1664, 31405, 1.04367e+06, 54, 12473, 38, 223351, 5, 836098, 0, 2.23641e+06, 0, 416, 15960, 87184, 5495, +32768, 1792, 31490, 1.01566e+06, 49, 12417, 38, 207856, 1, 840634, 0, 2.23574e+06, 0, 396, 16131, 92237, 5273, +32768, 1920, 31400, 989077, 51, 12132, 39, 194381, 3, 846721, 0, 2.22342e+06, 0, 379, 17485, 100466, 5363, +32768, 2048, 31609, 976225, 48, 12444, 35, 182595, 3, 842063, 0, 2.21046e+06, 0, 354, 17758, 102741, 4943, diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_4423_cols_4423_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_4423_cols_4423_d_factor_1.0.dat new file mode 100644 index 00000000..835f2c05 --- /dev/null +++ 
b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_4423_cols_4423_d_factor_1.0.dat @@ -0,0 +1,9 @@ +4423, 32, 18994, 191200, 139, 1240, 142, 37255, 0, 38023, 0, 46091, 0, 1108, 1769, 3767, 4683, +4423, 64, 19138, 118355, 70, 746, 70, 18971, 0, 36813, 0, 37488, 0, 583, 1339, 3104, 3609, +4423, 96, 18894, 99414, 48, 569, 47, 12786, 0, 39599, 0, 32428, 0, 400, 1261, 3172, 1614, +4423, 128, 18871, 88816, 36, 479, 35, 9688, 0, 40667, 0, 30557, 0, 308, 1348, 3053, 1267, +4423, 160, 19182, 91738, 29, 431, 28, 7783, 0, 40359, 0, 20806, 0, 262, 1420, 3210, 1476, +4423, 192, 19052, 86250, 24, 405, 24, 6586, 0, 40700, 0, 19543, 0, 299, 1300, 3214, 1276, +4423, 224, 19267, 81826, 20, 373, 20, 5648, 0, 40876, 0, 18718, 0, 196, 1264, 3148, 1323, +4423, 256, 18983, 80346, 19, 347, 19, 4997, 0, 41716, 0, 18220, 0, 175, 1296, 3149, 1207, +4423, 288, 19148, 71435, 16, 342, 16, 4486, 0, 42127, 0, 19298, 0, 162, 1340, 3231, 1183, diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_8191_cols_8191_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_8191_cols_8191_d_factor_1.0.dat new file mode 100644 index 00000000..ae6bb508 --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_8191_cols_8191_d_factor_1.0.dat @@ -0,0 +1,16 @@ +8191, 32, 19266, 708985, 266, 2730, 256, 214199, 0, 92573, 0, 137250, 0, 2104, 3312, 7101, 6865, +8191, 64, 19804, 350114, 137, 1726, 129, 107597, 0, 91138, 0, 88735, 0, 1084, 2526, 5841, 4778, +8191, 96, 19706, 268792, 92, 1391, 86, 71977, 0, 97774, 0, 78018, 0, 738, 2429, 6134, 3023, +8191, 128, 19003, 227161, 71, 1195, 64, 54164, 0, 96981, 0, 69187, 0, 573, 2582, 5914, 2355, +8191, 160, 19795, 245348, 60, 1111, 52, 43720, 0, 98859, 0, 69095, 0, 552, 2727, 6382, 2753, +8191, 192, 19366, 225753, 45, 1054, 45, 36683, 0, 98599, 0, 61314, 0, 397, 2503, 6150, 2315, +8191, 224, 19317, 213385, 46, 
1011, 38, 31541, 0, 100097, 0, 59585, 0, 352, 2408, 6366, 2513, +8191, 256, 19333, 202648, 38, 929, 32, 27751, 0, 100150, 0, 55447, 0, 316, 2536, 6119, 2058, +8191, 288, 19971, 171099, 34, 963, 29, 24772, 0, 101409, 0, 62503, 0, 287, 2600, 6530, 2159, +8191, 320, 19281, 168032, 28, 944, 26, 22384, 0, 100974, 0, 58430, 0, 262, 2426, 6408, 1965, +8191, 352, 20347, 166104, 26, 915, 27, 20414, 0, 102412, 0, 57407, 0, 239, 2403, 6624, 2144, +8191, 384, 20017, 163708, 26, 907, 22, 18818, 0, 102029, 0, 55183, 0, 231, 2463, 6520, 1795, +8191, 416, 20043, 144702, 21, 914, 20, 17504, 0, 102866, 0, 59997, 0, 211, 2568, 6775, 1940, +8191, 448, 19504, 144847, 22, 934, 19, 16332, 0, 103305, 0, 56838, 0, 193, 2430, 6807, 1830, +8191, 480, 19654, 144833, 25, 884, 21, 15321, 0, 103573, 0, 55939, 0, 188, 2415, 6985, 1961, +8191, 512, 19235, 143868, 17, 869, 17, 14393, 0, 103455, 0, 54066, 0, 179, 2497, 6633, 1635, diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_16384_cols_16384_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_16384_cols_16384_d_factor_1.0.dat new file mode 100644 index 00000000..27158d63 --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_16384_cols_16384_d_factor_1.0.dat @@ -0,0 +1,28 @@ +16384 16384 32 5398 5351 0 +16384 16384 64 2976 2984 0 +16384 16384 96 2224 2279 0 +16384 16384 128 1834 1946 481 +16384 16384 160 1941 2108 0 +16384 16384 192 1743 1963 0 +16384 16384 224 1625 1903 0 +16384 16384 256 1534 1883 0 +16384 16384 288 1430 1848 0 +16384 16384 320 1372 1873 0 +16384 16384 352 1331 1909 0 +16384 16384 384 1301 1978 0 +16384 16384 416 1246 2047 0 +16384 16384 448 1217 2130 0 +16384 16384 480 1202 2229 0 +16384 16384 512 1181 2328 0 +16384 16384 640 1125 2740 0 +16384 16384 768 1084 3210 0 +16384 16384 896 1048 3690 0 +16384 16384 1024 1025 4189 0 +16384 16384 1152 1015 4688 0 +16384 16384 1280 1008 5200 0 +16384 16384 1408 1002 5698 0 +16384 16384 1536 992 6202 0 +16384 16384 
1664 986 6765 0 +16384 16384 1792 983 7285 0 +16384 16384 1920 974 7738 0 +16384 16384 2048 973 8369 0 diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_32768_cols_32768_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_32768_cols_32768_d_factor_1.0.dat new file mode 100644 index 00000000..b3d770a8 --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_32768_cols_32768_d_factor_1.0.dat @@ -0,0 +1,28 @@ +32768 32768 32 31831 31680 0 +32768 32768 64 16800 16778 0 +32768 32768 96 12467 12829 0 +32768 32768 128 10147 10522 2590 +32768 32768 160 10364 10692 0 +32768 32768 192 9158 9705 0 +32768 32768 224 8434 9241 0 +32768 32768 256 7863 8920 0 +32768 32768 288 7456 8819 0 +32768 32768 320 7027 8656 0 +32768 32768 352 6767 8689 0 +32768 32768 384 6537 8731 0 +32768 32768 416 6364 8862 0 +32768 32768 448 6132 8937 0 +32768 32768 480 6004 9089 0 +32768 32768 512 5873 9258 0 +32768 32768 640 5466 10011 0 +32768 32768 768 5186 10918 0 +32768 32768 896 5003 11925 0 +32768 32768 1024 4852 12977 0 +32768 32768 1152 4757 13990 0 +32768 32768 1280 4656 15055 0 +32768 32768 1408 4613 16186 0 +32768 32768 1536 4552 17290 0 +32768 32768 1664 4507 18374 0 +32768 32768 1792 4472 19491 0 +32768 32768 1920 4436 20677 0 +32768 32768 2048 4396 21822 0 diff --git a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_4423_cols_4423_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_4423_cols_4423_d_factor_1.0.dat new file mode 100644 index 00000000..cbe42359 --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_4423_cols_4423_d_factor_1.0.dat @@ -0,0 +1,9 @@ +4423 4423 32 345 337 0 +4423 4423 64 241 251 0 +4423 4423 96 211 232 0 +4423 4423 128 196 237 44 +4423 4423 160 187 245 0 +4423 4423 192 179 253 0 +4423 4423 224 173 260 0 +4423 4423 256 171 272 0 +4423 4423 288 163 279 0 diff --git 
a/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_8191_cols_8191_d_factor_1.0.dat b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_8191_cols_8191_d_factor_1.0.dat new file mode 100644 index 00000000..2e4918f4 --- /dev/null +++ b/benchmark/bench_CQRRP/results/hopper/ICQRRP_GPU_speed_rows_8191_cols_8191_d_factor_1.0.dat @@ -0,0 +1,16 @@ +8191 8191 32 1196 1106 0 +8191 8191 64 675 694 0 +8191 8191 96 551 603 0 +8191 8191 128 480 562 129 +8191 8191 160 492 600 0 +8191 8191 192 455 596 0 +8191 8191 224 438 606 0 +8191 8191 256 419 612 0 +8191 8191 288 394 613 0 +8191 8191 320 382 640 0 +8191 8191 352 380 660 0 +8191 8191 384 373 683 0 +8191 8191 416 359 702 0 +8191 8191 448 354 724 0 +8191 8191 480 353 752 0 +8191 8191 512 348 786 0 diff --git a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc index ebd20cbf..f83c2b76 100644 --- a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc +++ b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc @@ -50,14 +50,14 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, // Re-generate and clear data template -static std::vector get_norms( QR_benchmark_data &all_data) { - - int64_t m = all_data.row; - int64_t n = all_data.col; +static std::vector get_norms( int64_t m, int64_t n, std::vector Mat, int64_t lda) { std::vector R_norms (n, 0.0); - for (int i = 0; i < n; ++i) - R_norms[i] = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n - i, n - i, &all_data.A.data()[(m + 1) * i], m); + for (int i = 0; i < n; ++i) { + R_norms[i] = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n - i, n - i, &Mat[(lda + 1) * i], lda); + if (i < 10) + printf("%e\n", R_norms[i]); + } return R_norms; } @@ -72,31 +72,42 @@ static void R_norm_ratio( auto tol = all_data.tolerance; auto d_factor = all_data.sampling_factor; + auto state_alg = state; + auto state_gen = state; + // Additional params setup. 
RandLAPACK::CQRRPT CQRRPT(true, tol); CQRRPT.nnz = 4; CQRRPT.num_threads = 48; - // Running HQRRP + // Running GEQP3 lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); - std::vector R_norms_HQRRP = get_norms(all_data); + std::vector R_norms_GEQP3 = get_norms(m, n, all_data.A, m); + printf("\nDone with QP3\n"); // Clear and re-generate data - data_regen(m_info, all_data, state); + state_gen = state; + data_regen(m_info, all_data, state_gen); + printf("\nStarting CQRRPT\n"); // Running CQRRP - CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state); - std::vector R_norms_CQRRPT = get_norms(all_data); + state_alg = state; + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); + std::vector R_norms_CQRRPT = get_norms(n, n, all_data.R, n); // Declare a data file - std::fstream file1("data_out/QR_R_norm_ratios_rows_" + std::to_string(m) + std::fstream file1("QR_R_norm_ratios_rows_" + std::to_string(m) + "_cols_" + std::to_string(n) + "_d_factor_" + std::to_string(d_factor) + ".dat", std::fstream::app); // Write the 1st metric info into a file. for (int i = 0; i < n; ++i) - file1 << R_norms_HQRRP[i] / R_norms_CQRRPT[i] << ", "; + file1 << R_norms_GEQP3[i] / R_norms_CQRRPT[i] << ", "; + + // Clear and re-generate data + state_gen = state; + data_regen(m_info, all_data, state_gen); } template @@ -112,14 +123,15 @@ static void sv_ratio( std::vector geqp3 (n, 0.0); std::vector sv_ratios_cqrrp (n, 0.0); - auto state1 = state; + auto state_alg = state; + auto state_gen = state; // Additional params setup. 
RandLAPACK::CQRRPT CQRRPT(true, tol); CQRRPT.nnz = 4; CQRRPT.num_threads = 48; - std::fstream file2("data_out/QR_sv_ratios_rows_" + std::to_string(m) + std::fstream file2("QR_sv_ratios_rows_" + std::to_string(m) + "_cols_" + std::to_string(n) + "_d_factor_" + std::to_string(d_factor) + ".dat", std::fstream::app); @@ -131,10 +143,10 @@ static void sv_ratio( lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.S.data(), (T*) nullptr, m, (T*) nullptr, n); // Clear and re-generate data - data_regen(m_info, all_data, state); + state_gen = state; + data_regen(m_info, all_data, state_gen); // Running GEQP3 - std::iota(all_data.J.begin(), all_data.J.end(), 1); lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); // Write the 2nd metric info into a file. @@ -143,20 +155,27 @@ static void sv_ratio( file2 << ",\n"; // Clear and re-generate data - data_regen(m_info, all_data, state1); + state_gen = state; + data_regen(m_info, all_data, state_gen); // Running CQRRP - CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state); + state_alg = state; + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); + R_dat = all_data.R.data(); // Write the 2nd metric info into a file. for (int i = 0; i < n; ++i) - file2 << std::abs(R_dat[(m + 1) * i] / S_dat[i]) << ", "; + file2 << std::abs(R_dat[(n + 1) * i] / S_dat[i]) << ", "; + + // Clear and re-generate data + state_gen = state; + data_regen(m_info, all_data, state_gen); } int main() { // Declare parameters int64_t m = std::pow(2, 17); - int64_t n = std::pow(2, 11); + int64_t n = 2000; double d_factor = 1.25; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); @@ -171,10 +190,11 @@ int main() { // Generate the input matrix: // polynomial & step for low coherence; // spiked for high coherence. 
- RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::polynomial); + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::spiked); m_info.cond_num = std::pow(10, 10); m_info.rank = n; m_info.exponent = 2.0; + m_info.scaling = std::pow(10, 10); RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); R_norm_ratio(m_info, all_data, state_constant1); diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons_SVDS.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons_SVDS.cc index 06f2c362..06d64895 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons_SVDS.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons_SVDS.cc @@ -343,7 +343,7 @@ static void call_all_algs( << residual_err_custom_RBKI << ", " << lowrank_err_RBKI << ", " << dur_rbki << ", " << residual_err_custom_RSVD << ", " << lowrank_err_RSVD << ", " << dur_rsvd << ", " << residual_err_custom_SVDS << ", " << lowrank_err_SVDS << ", " << dur_svds << ", " - << ", " << lowrank_err_SVD << ", " << dur_svd << ",\n"; + << residual_err_custom_SVD << ", " << lowrank_err_SVD << ", " << dur_svd << ",\n"; } } diff --git a/benchmark/bench_general/basic_blas_speed.cc b/benchmark/bench_general/basic_blas_speed.cc new file mode 100644 index 00000000..2297a733 --- /dev/null +++ b/benchmark/bench_general/basic_blas_speed.cc @@ -0,0 +1,145 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_gen.hh" + +#include +#include + +template +struct blas_benchmark_data { + int64_t dim; + std::vector A; + std::vector B; + std::vector C; + std::vector a; + std::vector b; + + blas_benchmark_data(int64_t n) : + A(n * n, 0.0), + B(n * n, 0.0), + C(n * n, 0.0), + a(n * n, 0.0), + b(n * n, 0.0) + { + dim = n; + } +}; + +// Re-generate and clear data +template +static void data_regen(RandLAPACK::gen::mat_gen_info m_info, + blas_benchmark_data &all_data, + RandBLAS::RNGState &state, + int alg_type) { + + auto state_const = state; + switch (alg_type) { + case 3: { + 
RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state_const); + state_const = state; + RandLAPACK::gen::mat_gen(m_info, all_data.B.data(), state_const); + std::fill(all_data.C.begin(), all_data.C.end(), 0.0); + break; + } + case 2: { + RandBLAS::DenseDist D1(1, m_info.cols); + RandBLAS::fill_dense(D1, all_data.a.data(), state_const).second; + state_const = state; + RandBLAS::fill_dense(D1, all_data.b.data(), state_const).second; + state_const = state; + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state_const); + break; + } + case 1: { + RandBLAS::DenseDist D2(1, m_info.cols); + RandBLAS::fill_dense(D2, all_data.a.data(), state_const).second; + state_const = state; + RandBLAS::fill_dense(D2, all_data.b.data(), state_const).second; + break; + } + default: { + break; + } + } +} + +template +static void call_all_algs( + RandLAPACK::gen::mat_gen_info m_info, + int64_t numruns, + int64_t n, + blas_benchmark_data &all_data, + RandBLAS::RNGState &state, + std::string output_filename) { + + // timing vars + long dur_blas1 = 0; + long dur_blas2 = 0; + long dur_blas3 = 0; + + // Making sure the states are unchanged + auto state_gen = state; + + for (int i = 0; i < numruns; ++i) { + printf("ITERATION %d, DIM %ld\n", i, n); + // Testing BLAS3 + auto start_blas3 = high_resolution_clock::now(); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, n, n, n, 1.0, all_data.A.data(), n, all_data.B.data(), n, 0.0, all_data.C.data(), n); + auto stop_blas3 = high_resolution_clock::now(); + dur_blas3 = duration_cast(stop_blas3 - start_blas3).count(); + + state_gen = state; + data_regen(m_info, all_data, state_gen, 3); + + // Testing BLAS2 + auto start_blas2 = high_resolution_clock::now(); + blas::gemv(Layout::ColMajor, Op::NoTrans, n, n, 1.0, all_data.A.data(), n, all_data.a.data(), 1, 1.0, all_data.b.data(), 1); + auto stop_blas2 = high_resolution_clock::now(); + dur_blas2 = duration_cast(stop_blas2 - start_blas2).count(); + + state_gen = state; + data_regen(m_info, 
all_data, state_gen, 2); + + // Testing BLAS1 + auto start_blas1 = high_resolution_clock::now(); + blas::axpy(n, -1.0, all_data.a.data(), 1, all_data.b.data(), 1); + auto stop_blas1 = high_resolution_clock::now(); + dur_blas1 = duration_cast(stop_blas1 - start_blas1).count(); + + state_gen = state; + data_regen(m_info, all_data, state_gen, 2); + + std::ofstream file(output_filename, std::ios::app); + file << n << ", " << dur_blas1 << ", " << dur_blas2 << ", " << dur_blas3 << ",\n"; + } +} + +int main() { + // Declare parameters + int64_t n_start = std::pow(2, 10); + int64_t n_stop = std::pow(2, 14); + auto state = RandBLAS::RNGState(); + auto state_constant = state; + // Timing results + std::vector res; + // Number of algorithm runs. We only record best times. + int64_t numruns = 5; + + // Allocate basic workspace + blas_benchmark_data all_data(n_stop); + // Generate the input matrix - gaussian suffices for performance tests. + RandLAPACK::gen::mat_gen_info m_info(n_stop, n_stop, RandLAPACK::gen::gaussian); + data_regen(m_info, all_data, state, 3); + data_regen(m_info, all_data, state, 2); + + // Declare a data file + std::string output_filename = "BLAS_performance_comp_col_start_" + + std::to_string(n_start) + + "_col_stop_" + std::to_string(n_stop) + + ".dat"; + + for (;n_start <= n_stop; n_start *= 2) { + call_all_algs(m_info, numruns, n_start, all_data, state_constant, output_filename); + } +} diff --git a/test/drivers/bench_cqrrp_gpu.cu b/test/drivers/bench_cqrrp_gpu.cu index fc535b8f..67959de6 100644 --- a/test/drivers/bench_cqrrp_gpu.cu +++ b/test/drivers/bench_cqrrp_gpu.cu @@ -83,50 +83,70 @@ class BenchCQRRP : public ::testing::TestWithParam bool profile_runtime, bool run_qrf, RandLAPACK::gen::mat_gen_info m_info, - int64_t d_factor, T tol, int64_t block_size, CQRRPBenchData &all_data, RandBLAS::RNGState state, - std::string output_filename_breakdown, + std::string output_filename_breakdown_QRF, + std::string output_filename_breakdown_CholQR, std::string 
output_filename_speed) { + T d_factor = 1.0; auto m = all_data.row; auto n = all_data.col; auto state_const = state; - auto d = d_factor * block_size; + int64_t d = d_factor * block_size; + // ICQRRP with QRF // Skethcing in an sampling regime cudaMalloc(&all_data.A_sk_device, d * n * sizeof(T)); - all_data.A_sk = ( T * ) calloc( d * n, sizeof( T ) ); - T* S = ( T * ) calloc( d * m, sizeof( T ) ); + all_data.A_sk = ( T * ) calloc( d * n, sizeof( T ) ); + T* S = ( T * ) calloc( d * m, sizeof( T ) ); RandBLAS::DenseDist D(d, m); RandBLAS::fill_dense(D, S, state_const).second; blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk, d); - free(S); cudaMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), cudaMemcpyHostToDevice); - - RandLAPACK::CQRRP_blocked_GPU CQRRP_GPU(profile_runtime, tol, block_size); - //CQRRP_GPU.use_qrf = true; - auto start = std::chrono::steady_clock::now(); - CQRRP_GPU.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device); - auto stop = std::chrono::steady_clock::now(); - auto diff = std::chrono::duration_cast(stop - start).count(); - auto rank = CQRRP_GPU.rank; - //printf("RANK AS RETURNED BY CQRRP GPU %4ld\n", rank); + RandLAPACK::CQRRP_blocked_GPU CQRRP_GPU_QRF(profile_runtime, tol, block_size); + CQRRP_GPU_QRF.use_qrf = true; + auto start_icqrrp_qrf = std::chrono::steady_clock::now(); + CQRRP_GPU_QRF.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device); + auto stop_icqrrp_qrf = std::chrono::steady_clock::now(); + auto diff_icqrrp_qrf = std::chrono::duration_cast(stop_icqrrp_qrf - start_icqrrp_qrf).count(); data_regen(m_info, all_data, state); cudaFree(all_data.A_sk_device); free(all_data.A_sk); - printf(" BLOCK SIZE = %ld TIME (MS) = %ld\n", block_size, diff); - std::ofstream file(output_filename_speed, std::ios::app); - file << m << " " << n << " " << block_size << " " << diff 
<< "\n"; + if(profile_runtime) { + std::ofstream file(output_filename_breakdown_QRF, std::ios::app); + std::copy(CQRRP_GPU_QRF.times.data(), CQRRP_GPU_QRF.times.data() + 18, std::ostream_iterator(file, ", ")); + file << "\n"; + } + + // ICQRRP with CholQR + // Skethcing in an sampling regime + cudaMalloc(&all_data.A_sk_device, d * n * sizeof(T)); + all_data.A_sk = ( T * ) calloc( d * n, sizeof( T ) ); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, all_data.A.data(), m, 0.0, all_data.A_sk, d); + free(S); + cudaMemcpy(all_data.A_sk_device, all_data.A_sk, d * n * sizeof(double), cudaMemcpyHostToDevice); + RandLAPACK::CQRRP_blocked_GPU CQRRP_GPU_CholQR(profile_runtime, tol, block_size); + CQRRP_GPU_CholQR.use_qrf = false; + auto start_icqrrp_cholqr = std::chrono::steady_clock::now(); + CQRRP_GPU_CholQR.call(m, n, all_data.A_device, m, all_data.A_sk_device, d, all_data.tau_device, all_data.J_device); + auto stop_icqrrp_cholqr = std::chrono::steady_clock::now(); + auto diff_icqrrp_cholqr = std::chrono::duration_cast(stop_icqrrp_cholqr - start_icqrrp_cholqr).count(); + data_regen(m_info, all_data, state); + cudaFree(all_data.A_sk_device); + free(all_data.A_sk); if(profile_runtime) { - std::ofstream file(output_filename_breakdown, std::ios::app); - std::copy(CQRRP_GPU.times.data(), CQRRP_GPU.times.data() + 17, std::ostream_iterator(file, ", ")); + std::ofstream file(output_filename_breakdown_CholQR, std::ios::app); + std::copy(CQRRP_GPU_CholQR.times.data(), CQRRP_GPU_CholQR.times.data() + 18, std::ostream_iterator(file, ", ")); file << "\n"; } + + // Optional QRF + long diff_qrf = 0; if (run_qrf) { lapack::Queue lapack_queue(0); using lapack::device_info_int; @@ -143,21 +163,32 @@ class BenchCQRRP : public ::testing::TestWithParam lapack::geqrf(m, n, all_data.A_device, m, all_data.tau_device, d_work_geqrf, d_size_geqrf, h_work_geqrf, h_size_geqrf, d_info, lapack_queue); lapack_queue.sync(); auto stop_qrf = std::chrono::steady_clock::now(); - 
auto diff_qrf = std::chrono::duration_cast(stop_qrf - start_qrf).count(); + diff_qrf = std::chrono::duration_cast(stop_qrf - start_qrf).count(); printf(" QRF TIME (MS) = %ld\n", diff_qrf); } + + printf(" BLOCK SIZE = %ld ICQRRP+QRF TIME (MS) = %ld ICQRRP+CholQR TIME (MS) = %ld\n", block_size, diff_icqrrp_qrf, diff_icqrrp_cholqr); + std::ofstream file(output_filename_speed, std::ios::app); + file << m << " " << n << " " << block_size << " " << diff_icqrrp_qrf << " " << diff_icqrrp_cholqr << " " << diff_qrf << "\n"; + cudaError_t ierr = cudaGetLastError(); + if (ierr != cudaSuccess) + { + RandLAPACK_CUDA_ERROR("Error before bench_CQRRP returned. " << cudaGetErrorString(ierr)) + abort(); + } } + // Not using this right now. But there's no harm in keeping it around. template static void bench_CholQR( RandLAPACK::gen::mat_gen_info m_info, - int64_t numcols, + int64_t numrows, CQRRPBenchData &all_data, RandBLAS::RNGState state, std::string output_filename) { - auto m = all_data.row; - auto n = numcols; + auto m = numrows; + auto n = all_data.col; auto state_const = state; // Initialize GPU stuff @@ -176,7 +207,7 @@ class BenchCQRRP : public ::testing::TestWithParam blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, (T) 1.0, all_data.R_device, n, all_data.A_device, m, lapack_queue); lapack_queue.sync(); auto stop_cholqr = std::chrono::steady_clock::now(); - auto diff_cholqr = std::chrono::duration_cast(stop_cholqr - start_cholqr).count(); + auto diff_cholqr = std::chrono::duration_cast(stop_cholqr - start_cholqr).count(); auto start_orhr_col = std::chrono::steady_clock::now(); // ORHR_COL part @@ -184,7 +215,7 @@ class BenchCQRRP : public ::testing::TestWithParam RandLAPACK::cuda_kernels::R_cholqr_signs_gpu(strm, n, n, all_data.R_device, all_data.D_device); cudaStreamSynchronize(strm); auto stop_orhr_col = std::chrono::steady_clock::now(); - auto diff_orhr_col = std::chrono::duration_cast(stop_orhr_col - start_orhr_col).count(); + 
auto diff_orhr_col = std::chrono::duration_cast(stop_orhr_col - start_orhr_col).count(); // Mandatory data re-generation data_regen(m_info, all_data, state); @@ -198,26 +229,32 @@ class BenchCQRRP : public ::testing::TestWithParam lapack::geqrf(m, n, all_data.A_device, m, all_data.tau_device, d_work_geqrf, d_size_geqrf, h_work_geqrf, h_size_geqrf, d_info, lapack_queue); lapack_queue.sync(); auto stop_qrf = std::chrono::steady_clock::now(); - auto diff_qrf = std::chrono::duration_cast(stop_qrf - start_qrf).count(); + auto diff_qrf = std::chrono::duration_cast(stop_qrf - start_qrf).count(); printf(" CholQR TIME (MS) = %ld\n", diff_cholqr); printf(" ORHR_COL TIME (MS) = %ld\n", diff_orhr_col); printf(" QRF TIME (MS) = %ld\n", diff_qrf); std::ofstream file(output_filename, std::ios::app); file << m << " " << n << " " << diff_cholqr << " " << diff_orhr_col << " " << diff_qrf << "\n"; - } + cudaError_t ierr = cudaGetLastError(); + if (ierr != cudaSuccess) + { + RandLAPACK_CUDA_ERROR("Error before bench_CholQR returned. 
" << cudaGetErrorString(ierr)) + abort(); + } + } }; -TEST_P(BenchCQRRP, CQRRP_GPU_benchmark_16k) { - int64_t m = std::pow(2, 14); - int64_t n = std::pow(2, 14); - double d_factor = 1.25; + +TEST_P(BenchCQRRP, GPU_fixed_blocksize) { + int64_t m = std::pow(2, 15); + int64_t n = std::pow(2, 15); int64_t b_sz = GetParam(); double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); bool profile_runtime = true; bool run_qrf = false; - if(b_sz == 120) { + if(b_sz == 128) { run_qrf = true; } @@ -227,45 +264,27 @@ TEST_P(BenchCQRRP, CQRRP_GPU_benchmark_16k) { cudaMemcpy(all_data.A_device, all_data.A.data(), m * n * sizeof(double), cudaMemcpyHostToDevice); - std::string file1 = "ICQRRP_GPU_runtime_breakdown_rows_" + std::to_string(m) - + "_cols_" + std::to_string(n) - + "_d_factor_" + std::to_string(d_factor) - + ".dat"; + std::string file1 = "ICQRRP_GPU_runtime_breakdown_innerQRF_1_rows_" + + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_d_factor_1.0.dat"; + + std::string file2 = "ICQRRP_GPU_runtime_breakdown_innerQRF_0_rows_" + + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_d_factor_1.0.dat"; - std::string file2 = "ICQRRP_GPU_speed_rows_" + std::to_string(m) - + "_cols_" + std::to_string(n) - + "_d_factor_" + std::to_string(d_factor) - + ".dat"; + std::string file3 = "ICQRRP_GPU_speed_rows_" + + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_d_factor_1.0.dat"; - bench_CQRRP(profile_runtime, run_qrf, m_info, d_factor, tol, b_sz, all_data, state, file1, file2); + bench_CQRRP(profile_runtime, run_qrf, m_info, tol, b_sz, all_data, state, file1, file2, file3); } INSTANTIATE_TEST_SUITE_P( - CQRRP_GPU_16k_benchmarks, + CQRRP_GPU_benchmarks, BenchCQRRP, - ::testing::Values(32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, - 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 264, 272, 280, 288, 296, 304, 312, 320, 328, 336, 344, - 352, 360, 368, 376, 384, 392, 400, 408, 416, 
424, 432, 440, 448, 456, 464, 472, 480, 488, 496, 504, 512) + ::testing::Values(32, 64, 96, 128, 160, 192, 224, 256, 288, 320, 352, 384, 416, 448, 480, 512, 640, 768, 896, 1024, 1152, 1280, 1408, 1536, 1664, 1792, 1920, 2048) ); - -TEST_F(BenchCQRRP, Bench_CholQR) { - int64_t m = std::pow(2, 14); - int64_t n_start = 120; - int64_t n_stop = std::pow(2, 14); - auto state = RandBLAS::RNGState(); - - CQRRPBenchData all_data(m, n_stop); - RandLAPACK::gen::mat_gen_info m_info(m, n_stop, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); - cudaMemcpy(all_data.A_device, all_data.A.data(), m * n_stop * sizeof(double), cudaMemcpyHostToDevice); - - - std::string file = "CholQR_GPU_speed_rows_" + std::to_string(m) - + "_cols_start_" + std::to_string(n_start) - + "_cols_stop_" + std::to_string(n_stop) - + ".dat"; - - for(int i = n_start; i <= n_stop; i += n_start) - bench_CholQR(m_info, i, all_data, state, file); -} #endif diff --git a/test/drivers/test_cqrrp.cc b/test/drivers/test_cqrrp.cc index e724f600..28ae7b3a 100644 --- a/test/drivers/test_cqrrp.cc +++ b/test/drivers/test_cqrrp.cc @@ -143,7 +143,6 @@ class TestCQRRP : public ::testing::Test error_check(norm_A, all_data, atol); } } - }; #if !defined(__APPLE__) @@ -286,11 +285,11 @@ TEST_F(TestCQRRP, CQRRP_blocked_near_zero_input_qp3) { // Note: If Subprocess killed exception -> reload vscode TEST_F(TestCQRRP, CQRRP_blocked_near_zero_luqr) { - int64_t m = 1000;//5000; - int64_t n = 1000;//2000; + int64_t m = 1000; + int64_t n = 1000; int64_t k = 1000; double d_factor = 1;//1.0; - int64_t b_sz = 100;//500; + int64_t b_sz = 100; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); @@ -299,7 +298,8 @@ TEST_F(TestCQRRP, CQRRP_blocked_near_zero_luqr) { RandLAPACK::CQRRP_blocked CQRRP_blocked(true, tol, b_sz); std::fill(&(all_data.A.data())[0], &(all_data.A.data())[m * n], 0.0); - all_data.A[1000*200 + 10] = 1; + 
//all_data.A[1000*200 + 10] = 1; + all_data.A[10*5 + 1] = 1; norm_and_copy_computational_helper(norm_A, all_data); test_CQRRP_general(d_factor, norm_A, all_data, CQRRP_blocked, state); @@ -346,4 +346,26 @@ TEST_F(TestCQRRP, CQRRP_blocked_zero_mat) { norm_and_copy_computational_helper(norm_A, all_data); test_CQRRP_general(d_factor, norm_A, all_data, CQRRP_blocked, state); } + +TEST_F(TestCQRRP, CQRRP_blocked_qrf) { + int64_t m = 5000;//5000; + int64_t n = 2800;//2000; + int64_t k = 2800; + double d_factor = 1;//1.0; + int64_t b_sz = 900;//500; + double norm_A = 0; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + + CQRRPTestData all_data(m, n, k); + RandLAPACK::CQRRP_blocked CQRRP_blocked(true, tol, b_sz); + CQRRP_blocked.use_qrf = true; + CQRRP_blocked.internal_nb = 10; + + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + + norm_and_copy_computational_helper(norm_A, all_data); + test_CQRRP_general(d_factor, norm_A, all_data, CQRRP_blocked, state); +} #endif diff --git a/test/drivers/test_cqrrp_gpu.cu b/test/drivers/test_cqrrp_gpu.cu index 3778a1a5..fd73553c 100644 --- a/test/drivers/test_cqrrp_gpu.cu +++ b/test/drivers/test_cqrrp_gpu.cu @@ -194,6 +194,7 @@ class TestCQRRP : public ::testing::TestWithParam error_check(norm_A, all_data, atol); } + cudaError_t ierr = cudaGetLastError(); if (ierr != cudaSuccess) { @@ -298,11 +299,11 @@ TEST_F(TestCQRRP, CQRRP_GPU_vectors) { // Note: If Subprocess killed exception -> reload vscode TEST_F(TestCQRRP, CQRRP_GPU_near_zero_input) { - int64_t m = 1000;//5000; - int64_t n = 1000;//2000; + int64_t m = 1000; + int64_t n = 1000; int64_t k = 1000; - double d_factor = 1;//1.0; - int64_t b_sz = 100;//500; + double d_factor = 1; + int64_t b_sz = 100; int64_t d = d_factor * b_sz; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85);