diff --git a/RandLAPACK/comps/rl_orth.hh b/RandLAPACK/comps/rl_orth.hh index 8b9c6d44..573f9a6b 100644 --- a/RandLAPACK/comps/rl_orth.hh +++ b/RandLAPACK/comps/rl_orth.hh @@ -238,7 +238,7 @@ int PLUL::call( if(lapack::getrf(m, n, A_dat, m, ipiv_dat)) return 1; // failure condition - util::get_L(m, n, A, 1); + util::get_L(m, n, A_dat, 1); lapack::laswp(n, A_dat, m, 1, n, ipiv_dat, 1); return 0; diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index 8a31e8ff..77406edd 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -25,10 +25,12 @@ class CQRRPTalg { virtual int call( int64_t m, int64_t n, - std::vector &A, - int64_t d, - std::vector &R, - std::vector &J, + T* A, + int64_t lda, + T* R, + int64_t ldr, + int64_t* J, + T d_factor, RandBLAS::RNGState &state ) = 0; }; @@ -66,9 +68,6 @@ class CQRRPT : public CQRRPTalg { oversampling = 10; use_cholqr = 0; panel_pivoting = 1; - naive_rank_estimate = 1; - use_fro_norm = 1; - cond_check = 0; } /// Computes a QR factorization with column pivots of the form: @@ -112,46 +111,34 @@ class CQRRPT : public CQRRPTalg { int call( int64_t m, int64_t n, - std::vector &A, - int64_t d, - std::vector &R, - std::vector &J, + T* A, + int64_t lda, + T* R, + int64_t ldr, + int64_t* J, + T d_factor, RandBLAS::RNGState &state ) override; public: bool verbosity; bool timing; - bool cond_check; T eps; int64_t rank; - // 10 entries + // 8 entries std::vector times; // tuning SASOS int num_threads; int64_t nnz; - // Buffers - std::vector A_hat; - std::vector tau; - std::vector R_sp; - // HQRRP-related int no_hqrrp; int64_t nb_alg; int64_t oversampling; int64_t panel_pivoting; int64_t use_cholqr; - - // Rank estimate-related - int naive_rank_estimate; - int use_fro_norm; - - // Preconditioning-related - T cond_num_A_pre; - T cond_num_A_norm_pre; }; // ----------------------------------------------------------------------------- @@ -159,73 +146,66 @@ template int CQRRPT::call( int64_t 
m, int64_t n, - std::vector &A, - int64_t d, - std::vector &R, - std::vector &J, + T* A, + int64_t lda, + T* R, + int64_t ldr, + int64_t* J, + T d_factor, RandBLAS::RNGState &state ){ - //-------TIMING VARS--------/ + ///--------------------TIMING VARS--------------------/ high_resolution_clock::time_point saso_t_stop; high_resolution_clock::time_point saso_t_start; - long saso_t_dur = 0; - high_resolution_clock::time_point qrcp_t_start; high_resolution_clock::time_point qrcp_t_stop; - long qrcp_t_dur = 0; - high_resolution_clock::time_point rank_reveal_t_start; high_resolution_clock::time_point rank_reveal_t_stop; - long rank_reveal_t_dur = 0; - high_resolution_clock::time_point cholqr_t_start; high_resolution_clock::time_point cholqr_t_stop; - long cholqr_t_dur = 0; - high_resolution_clock::time_point a_mod_piv_t_start; high_resolution_clock::time_point a_mod_piv_t_stop; - long a_mod_piv_t_dur = 0; - high_resolution_clock::time_point a_mod_trsm_t_start; high_resolution_clock::time_point a_mod_trsm_t_stop; - long a_mod_trsm_t_dur = 0; - - high_resolution_clock::time_point copy_t_start; - high_resolution_clock::time_point copy_t_stop; - long copy_t_dur = 0; - - high_resolution_clock::time_point resize_t_start; - high_resolution_clock::time_point resize_t_stop; - long resize_t_dur = 0; - high_resolution_clock::time_point total_t_start; high_resolution_clock::time_point total_t_stop; - long total_t_dur = 0; + long saso_t_dur = 0; + long qrcp_t_dur = 0; + long rank_reveal_t_dur = 0; + long cholqr_t_dur = 0; + long a_mod_piv_t_dur = 0; + long a_mod_trsm_t_dur = 0; + long total_t_dur = 0; - if(this -> timing) { + if(this -> timing) total_t_start = high_resolution_clock::now(); - resize_t_start = high_resolution_clock::now(); - } - T* A_dat = A.data(); - T* A_hat_dat = util::upsize(d * n, this->A_hat); - T* tau_dat = util::upsize(n, this->tau); - J.resize(n); - int64_t* J_dat = J.data(); + int i; + int64_t k = n; + int64_t d = d_factor * n; + // A constant for initial 
rank estimation. + T eps_initial_rank_estimation = 2 * std::pow(std::numeric_limits::epsilon(), 0.95); + // Variables for a posteriori rank estimation. + int64_t new_rank; + T running_max, running_min, curr_entry; - if(this -> timing) { - resize_t_stop = high_resolution_clock::now(); - resize_t_dur = duration_cast(resize_t_stop - resize_t_start).count(); + T* A_hat = ( T * ) calloc( d * n, sizeof( T ) ); + T* tau = ( T * ) calloc( n, sizeof( T ) ); + // Buffer for column pivoting. + std::vector J_buf(n, 0); + + if(this -> timing) saso_t_start = high_resolution_clock::now(); - } + /// Generating a SASO RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz}; RandBLAS::SparseSkOp S(DS, state); state = RandBLAS::fill_sparse(S); + /// Applying a SASO RandBLAS::sketch_general( Layout::ColMajor, Op::NoTrans, Op::NoTrans, - d, n, m, 1.0, S, 0, 0, A.data(), m, 0.0, A_hat_dat, d + d, n, m, 1.0, S, 0, 0, A, lda, 0.0, A_hat, d ); if(this -> timing) { @@ -233,103 +213,48 @@ int CQRRPT::call( qrcp_t_start = high_resolution_clock::now(); } - // QRCP - add failure condition + /// Performing QRCP on a sketch if(this->no_hqrrp) { - lapack::geqp3(d, n, A_hat_dat, d, J_dat, tau_dat); - } - else { - std::iota(J.begin(), J.end(), 1); - hqrrp(d, n, A_hat_dat, d, J_dat, tau_dat, this->nb_alg, this->oversampling, this->panel_pivoting, this->use_cholqr, state, (T*) nullptr); + lapack::geqp3(d, n, A_hat, d, J, tau); + } else { + std::iota(J, &J[n], 1); + hqrrp(d, n, A_hat, d, J, tau, this->nb_alg, this->oversampling, this->panel_pivoting, this->use_cholqr, state, (T*) nullptr); } if(this -> timing) { qrcp_t_stop = high_resolution_clock::now(); - resize_t_start = high_resolution_clock::now(); - } - - T* R_dat = util::upsize(n * n, R); - - if(this -> timing) { - resize_t_stop = high_resolution_clock::now(); - resize_t_dur += duration_cast(resize_t_stop - resize_t_start).count(); rank_reveal_t_start = high_resolution_clock::now(); } - int64_t k = n; - int i; - T 
eps_initial_rank_estimation = 2 * std::pow(std::numeric_limits::epsilon(), 0.95); - if(this->naive_rank_estimate) { - /// Using R[i,i] to approximate the i-th singular value of A_hat. - /// Truncate at the largest i where R[i,i] / R[0,0] >= eps. - for(i = 0; i < n; ++i) { - if(std::abs(A_hat_dat[i * d + i]) / std::abs(A_hat_dat[0]) < eps_initial_rank_estimation) { - k = i; - break; - } - } - this->rank = k; - } - else { - // Oleg's scheme for rank estimation - for(i = 0; i < n; ++i) { - // copy over an upper-triangular matrix R - // from col-maj to row-maj format - blas::copy(i + 1, &A_hat_dat[i * d], 1, &R_dat[i], n); - } - - T norm_R = 0.0; - if(this->use_fro_norm) { - // find fro norm of the full R - norm_R = lapack::lange(Norm::Fro, n, n, R_dat, n); - } else { - // find l2 norm of the full R - norm_R = RandLAPACK::util::estimate_spectral_norm(n, n, R_dat, 10, state); - eps_initial_rank_estimation = 5 * eps_initial_rank_estimation; - } - - T norm_R_sub = lapack::lange(Norm::Fro, 1, n, &R_dat[(n - 1) * n], 1); - // Check if R is full column rank checking if||A[n - 1:, n - 1:]||_F > tau_trunk * ||A||_F - if ((norm_R_sub > eps_initial_rank_estimation * norm_R)) { - k = n; - } else { - k = RandLAPACK::util::rank_search_binary(0, n + 1, std::floor(n / 2), n, norm_R, eps_initial_rank_estimation, R_dat); + /// Using naive rank estimation to ensure that R used for preconditioning is invertible. + /// The actual rank estimate k will be computed a posteriori. + /// Using R[i,i] to approximate the i-th singular value of A_hat. + /// Truncate at the largest i where R[i,i] / R[0,0] >= eps. 
+ for(i = 0; i < n; ++i) { + if(std::abs(A_hat[i * d + i]) / std::abs(A_hat[0]) < eps_initial_rank_estimation) { + k = i; + break; } - - this->rank = k; - // Clear R - std::fill(R.begin(), R.end(), 0.0); } + this->rank = k; - if(this -> timing) { + if(this -> timing) rank_reveal_t_stop = high_resolution_clock::now(); - resize_t_start = high_resolution_clock::now(); - } - T* R_sp_dat = util::upsize(k * k, this->R_sp); + // Allocating space for a preconditioner buffer. + T* R_sp = ( T * ) calloc( k * k, sizeof( T ) ); + /// Extracting a k by k upper-triangular R. + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R_sp, k); + /// Extracting a k by n R representation (k by k upper-triangular, rest - general) + lapack::lacpy(MatrixType::Upper, k, k, A_hat, d, R, ldr); + lapack::lacpy(MatrixType::General, k, n - k, &A_hat[d * k], d, &R[n * k], ldr); - if(this -> timing) { - resize_t_stop = high_resolution_clock::now(); - copy_t_start = high_resolution_clock::now(); - } - - // performing a copy column by column - for(i = 0; i < k; ++i) { - // extract k by k R - blas::copy(i + 1, &A_hat_dat[i * d], 1, &R_sp_dat[i * k], 1); - // extract full R - blas::copy(i + 1, &A_hat_dat[i * d], 1, &R_dat[i * k], 1); - } - for(i = k; i < n; ++i) { - blas::copy(k, &A_hat_dat[i * d], 1, &R_dat[i * k], 1); - } - - if(this -> timing) { - copy_t_stop = high_resolution_clock::now(); + if(this -> timing) a_mod_piv_t_start = high_resolution_clock::now(); - } // Swap k columns of A with pivots from J - util::col_swap(m, n, k, A_dat, m, J); + blas::copy(n, J, 1, J_buf.data(), 1); + util::col_swap(m, n, k, A, lda, J_buf); if(this -> timing) { a_mod_piv_t_stop = high_resolution_clock::now(); @@ -337,93 +262,67 @@ int CQRRPT::call( } // A_pre * R_sp = AP - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp_dat, k, A_dat, m); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp, k, A, lda); - if(this -> 
timing) + if(this -> timing) { a_mod_trsm_t_stop = high_resolution_clock::now(); - - // Check the condition number of a A_pre - if(this -> cond_check) - { - // Check cond(A_pre) - std::vector A_pre_cpy; - std::vector s; - this->cond_num_A_pre = RandLAPACK::util::cond_num_check(m, k, A, A_pre_cpy, s, false); - - A_pre_cpy.clear(); - s.clear(); - // Check cond(normc(A_pre)) - std::vector A_norm_pre; - RandLAPACK::util::normc(m, k, A, A_norm_pre); - this->cond_num_A_norm_pre = RandLAPACK::util::cond_num_check(m, k, A_norm_pre, A_pre_cpy, s, false); - } - - if(this -> timing) cholqr_t_start = high_resolution_clock::now(); + } // Do Cholesky QR - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A_dat, m, 0.0, R_sp_dat, k); - if(lapack::potrf(Uplo::Upper, k, R_sp_dat, k)){ - if(this->verbosity) - throw std::runtime_error("Cholesky decomposition failed."); - return 1; - } + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, k, m, 1.0, A, lda, 0.0, R_sp, k); + lapack::potrf(Uplo::Upper, k, R_sp, k); // Re-estimate rank after we have the R-factor form Cholesky QR. // The strategy here is the same as in naive rank estimation. // This also automatically takes care of any potentical failures in Cholesky factorization. - // Note that the diagonal of R_sp_dat may not be sorted, so we need to keep the running max/min - // We expect the loss in the orthogonality of Q to be approximately equal to u * cond(R_sp_dat)^2, where u is the unit roundoff for the numerical type T. - int64_t new_rank = k; - T running_max = R_sp_dat[0]; - T running_min = R_sp_dat[0]; - T curr_entry; + // Note that the diagonal of R_sp may not be sorted, so we need to keep the running max/min + // We expect the loss in the orthogonality of Q to be approximately equal to u * cond(R_sp)^2, where u is the unit roundoff for the numerical type T. 
+ new_rank = k; + running_max = R_sp[0]; + running_min = R_sp[0]; for(i = 0; i < k; ++i) { curr_entry = std::abs(R_sp[i * k + i]); - - if(curr_entry > running_max) running_max = curr_entry; - if(curr_entry < running_min) running_max = running_min; - + running_max = std::max(running_max, curr_entry); + running_min = std::min(running_min, curr_entry); if(running_max / running_min >= std::sqrt(this->eps / std::numeric_limits::epsilon())) { new_rank = i - 1; break; } } - // Beware of that R_sp and R have k rows and need to be downsized by rows - RandLAPACK::util::row_resize(k, k, R_sp, new_rank); - RandLAPACK::util::row_resize(k, n, R, new_rank); - - k = new_rank; - this->rank = k; - - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, k, 1.0, R_sp_dat, k, A_dat, m); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, new_rank, 1.0, R_sp, k, A, lda); if(this -> timing) cholqr_t_stop = high_resolution_clock::now(); - // Get R - // trmm - blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, Diag::NonUnit, k, n, 1.0, R_sp_dat, k, R_dat, k); + // Get the final R-factor. + blas::trmm(Layout::ColMajor, Side::Left, Uplo::Upper, Op::NoTrans, Diag::NonUnit, new_rank, n, 1.0, R_sp, k, R, ldr); + + // Set the rank parameter to the value computed a posteriori. 
+ this->rank = k; if(this -> timing) { saso_t_dur = duration_cast(saso_t_stop - saso_t_start).count(); qrcp_t_dur = duration_cast(qrcp_t_stop - qrcp_t_start).count(); rank_reveal_t_dur = duration_cast(rank_reveal_t_stop - rank_reveal_t_start).count(); - resize_t_dur += duration_cast(resize_t_stop - resize_t_start).count(); - copy_t_dur += duration_cast(copy_t_stop - copy_t_start).count(); a_mod_piv_t_dur = duration_cast(a_mod_piv_t_stop - a_mod_piv_t_start).count(); a_mod_trsm_t_dur = duration_cast(a_mod_trsm_t_stop - a_mod_trsm_t_start).count(); cholqr_t_dur = duration_cast(cholqr_t_stop - cholqr_t_start).count(); total_t_stop = high_resolution_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); - long t_rest = total_t_dur - (saso_t_dur + qrcp_t_dur + rank_reveal_t_dur + cholqr_t_dur + a_mod_piv_t_dur + a_mod_trsm_t_dur + copy_t_dur + resize_t_dur); + long t_rest = total_t_dur - (saso_t_dur + qrcp_t_dur + rank_reveal_t_dur + cholqr_t_dur + a_mod_piv_t_dur + a_mod_trsm_t_dur); // Fill the data vector - this -> times = {saso_t_dur, qrcp_t_dur, rank_reveal_t_dur, cholqr_t_dur, a_mod_piv_t_dur, a_mod_trsm_t_dur, copy_t_dur, resize_t_dur, t_rest, total_t_dur}; + this -> times = {saso_t_dur, qrcp_t_dur, rank_reveal_t_dur, cholqr_t_dur, a_mod_piv_t_dur, a_mod_trsm_t_dur, t_rest, total_t_dur}; } + + free(A_hat); + free(R_sp); + free(tau); + return 0; } } // end namespace RandLAPACK diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh index 55b59975..2fbcdf5c 100644 --- a/RandLAPACK/misc/rl_util.hh +++ b/RandLAPACK/misc/rl_util.hh @@ -53,23 +53,7 @@ void diag( blas::copy(k, s.data(), 1, S.data(), m + 1); } -/// Captures k diagonal elements of A and stores them in buf. 
-template -void extract_diag( - int64_t m, - int64_t n, - int64_t k, - const std::vector &A, - std::vector &buf -) { - if(k > std::min(m, n)) - throw std::runtime_error("Invalid rank parameter."); - for(int i = 0; i < k; ++i) - buf[i] = A[(i * m) + i]; -} - -/// Extracts the l-portion of the GETRF result, places 1's on the main diagonal. -/// Overwrites the passed-in matrix. +/// Zeros-out the upper-triangular portion of A template void get_L( int64_t m, @@ -85,35 +69,6 @@ void get_L( } } -template -void get_L( - int64_t m, - int64_t n, - std::vector &L, - int overwrite_diagonal -) { - get_L(m, n, L.data(), overwrite_diagonal); -} - -/// Stores the upper-triangualr portion of A in U. -template -void get_U( - int64_t m, - int64_t n, - const std::vector &A, - std::vector &U // We are assuming U is n by n -) { - // Vector end pointer - int size = m * n; - - const T* A_dat = A.data(); - T* U_dat = U.data(); - - for(int i = 0, j = 1, k = 0; i < size && j <= m; i += m, k +=n, ++j) { - blas::copy(j, &A_dat[i], 1, &U_dat[k], 1); - } -} - /// Zeros-out the lower-triangular portion of A template void get_U( diff --git a/benchmark/CHOLQR_vs_GEQRF.cc b/benchmark/CHOLQR_vs_GEQRF.cc deleted file mode 100644 index 28528e3a..00000000 --- a/benchmark/CHOLQR_vs_GEQRF.cc +++ /dev/null @@ -1,158 +0,0 @@ -#include "RandLAPACK.hh" -#include "rl_blaspp.hh" -#include "rl_lapackpp.hh" -#include "rl_gen.hh" - -#include -#include - -template -struct CHOLQR_vs_GEQRF_speed_benchmark_data { - int64_t row; - int64_t col; - std::vector A; - std::vector tau; - std::vector R; - std::vector T_mat; - std::vector D; - - CHOLQR_vs_GEQRF_speed_benchmark_data(int64_t m, int64_t n) : - A(m * n, 0.0), - tau(n, 0.0), - R(n * n, 0.0), - T_mat(n * n, 0.0), - D(n, 0.0) - { - row = m; - col = n; - } -}; - -// Re-generate and clear data -template -static void data_regen(RandLAPACK::gen::mat_gen_info m_info, - CHOLQR_vs_GEQRF_speed_benchmark_data &all_data, - RandBLAS::RNGState &state, int is_cholqr) { - - 
RandLAPACK::gen::mat_gen(m_info, all_data.A, state); - std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); - std::fill(all_data.T_mat.begin(), all_data.T_mat.end(), 0.0); - if (is_cholqr) { - std::fill(all_data.R.begin(), all_data.R.end(), 0.0); - std::fill(all_data.D.begin(), all_data.D.end(), 0.0); - } -} - -template -static std::vector call_all_algs( - int64_t rows, - RandLAPACK::gen::mat_gen_info m_info, - int64_t numruns, - CHOLQR_vs_GEQRF_speed_benchmark_data &all_data, - RandBLAS::RNGState &state) { - - auto m = all_data.row; - auto n = all_data.col; - - // timing vars - long dur_cholqr = 0; - long dur_geqrf = 0; - long t_cholqr_best = 0; - long t_geqrf_best = 0; - - T* R_dat = all_data.R.data(); - T* D_dat = all_data.D.data(); - T* T_dat = all_data.T_mat.data(); - T* tau_dat = all_data.tau.data(); - - for (int k = 0; k < numruns; ++k) { - // Testing cholqr - auto start_cholqr = high_resolution_clock::now(); - //----------------------------------------------------------------------------------------------------------------------------------------/ - // Find R = A^TA. - blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, rows, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); - // Perform Cholesky factorization on A. - lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); - // Find Q = A * inv(R) - blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); - // Perform Householder reconstruction - lapack::orhr_col(rows, n, n, all_data.A.data(), rows, all_data.T_mat.data(), n, all_data.D.data()); - // Update the signs in the R-factor - int i, j; - for(i = 0; i < n; ++i) - for(j = 0; j < (i + 1); ++j) - R_dat[(n * i) + j] *= D_dat[j]; - - // Copy the R-factor into the upper-trianular portion of A - lapack::lacpy(MatrixType::Upper, n, n, all_data.R.data(), n, all_data.A.data(), m); - // Entries of tau will be placed on the main diagonal of matrix T from orhr_col(). 
- for(i = 0; i < n; ++i) - tau_dat[i] = T_dat[(n + 1) * i]; - //----------------------------------------------------------------------------------------------------------------------------------------/ - auto stop_cholqr = high_resolution_clock::now(); - dur_cholqr = duration_cast(stop_cholqr - start_cholqr).count(); - // Update best timing - k == 0 ? t_cholqr_best = dur_cholqr : (dur_cholqr < t_cholqr_best) ? t_cholqr_best = dur_cholqr : NULL; - - // Clear and re-generate data - data_regen(m_info, all_data, state, 0); - - // Testing GEQRF - auto start_geqrf = high_resolution_clock::now(); - lapack::geqrf(rows, n, all_data.A.data(), m, all_data.tau.data()); - lapack::larft( lapack::Direction::Forward, lapack::StoreV::Columnwise, rows, n, all_data.A.data(), m, all_data.tau.data(), all_data.T_mat.data(), n); - auto stop_geqrf = high_resolution_clock::now(); - dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); - // Update best timing - k == 0 ? t_geqrf_best = dur_geqrf : (dur_geqrf < t_geqrf_best) ? t_geqrf_best = dur_geqrf : NULL; - - // Clear and re-generate data - data_regen(m_info, all_data, state, 1); - } - - printf("For %ld rows\n", rows); - printf("CHOLQR takes %ld μs\n", t_cholqr_best); - printf("GEQRF takes %ld μs\n\n", t_geqrf_best); - std::vector res{t_cholqr_best, t_geqrf_best}; - - return res; -} - -int main() { - // Declare parameters - int64_t rows_start = std::pow(2, 14); - int64_t rows_end = 256; - int64_t cols = 256; - - auto state = RandBLAS::RNGState(); - auto state_constant = state; - // Timing results - std::vector res; - // Number of algorithm runs. We only record best times. - int64_t numruns = 15; - - // Allocate basic workspace - CHOLQR_vs_GEQRF_speed_benchmark_data all_data(rows_start, cols); - // Generate the input matrix - gaussian suffices for performance tests. 
- RandLAPACK::gen::mat_gen_info m_info(rows_start, cols, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); - - // Declare a data file - std::fstream file("CHOLQR_vs_GEQRF_time_raw_rows_start_" + std::to_string(rows_start) - + "_rows_end_" + std::to_string(rows_end) - + "_cols_" + std::to_string(cols) - + ".dat", std::fstream::app); - - int64_t total_time_cholqr = 0; - int64_t total_time_geqrf = 0; -#if !defined(__APPLE__) - for (int rows = rows_start; rows >= rows_end; rows /= 2) { - res = call_all_algs(rows, m_info, numruns, all_data, state_constant); - file << res[0] << " " << res[1] << "\n"; - total_time_cholqr += res[0]; - total_time_geqrf += res[1]; - } -#endif - printf("In total, CHOLQR takes %ld μs\n", total_time_cholqr); - printf("In total, GEQRF takes %ld μs\n\n", total_time_geqrf); -} \ No newline at end of file diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 558f0ba0..ed608603 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -45,15 +45,19 @@ set( Benchmark_libs RandLAPACK ) -# Lapack functionality benchmarks -add_benchmark(NAME Chol_check CXX_SOURCES Chol_check.cc LINK_LIBS ${Benchmark_libs}) +# Performance profiling through GEMM +add_benchmark(NAME GEMM_flop_count CXX_SOURCES bench_general/GEMM_flop_count.cc LINK_LIBS ${Benchmark_libs}) +# Lapack functionality benchmark +add_benchmark(NAME Chol_check CXX_SOURCES bench_general/Chol_check.cc LINK_LIBS ${Benchmark_libs}) +# Data conversion helper script +add_benchmark(NAME convert_time CXX_SOURCES bench_general/convert_time.cc LINK_LIBS ${Benchmark_libs}) -# Data conversion helper scripts -add_benchmark(NAME convert_time CXX_SOURCES convert_time.cc LINK_LIBS ${Benchmark_libs}) +# CQRRPT benchmarks +add_benchmark(NAME CQRRPT_speed_comparisons CXX_SOURCES bench_CQRRPT/CQRRPT_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRPT_runtime_breakdown CXX_SOURCES bench_CQRRPT/CQRRPT_runtime_breakdown.cc LINK_LIBS 
${Benchmark_libs}) +add_benchmark(NAME CQRRPT_pivot_quality CXX_SOURCES bench_CQRRPT/CQRRPT_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) -# QR speed comparisons with CQRRP benchmarks -add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CHOLQR_vs_GEQRF CXX_SOURCES CHOLQR_vs_GEQRF.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_block_per_time CXX_SOURCES CQRRP_block_per_time.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_inner_speed CXX_SOURCES CQRRP_inner_speed.cc LINK_LIBS ${Benchmark_libs}) -add_benchmark(NAME CQRRP_accuracy CXX_SOURCES CQRRP_accuracy.cc LINK_LIBS ${Benchmark_libs}) \ No newline at end of file +# CQRRP benchmarks +add_benchmark(NAME CQRRP_speed_comparisons CXX_SOURCES bench_CQRRP/CQRRP_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_runtime_breakdown CXX_SOURCES bench_CQRRP/CQRRP_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) +add_benchmark(NAME CQRRP_pivot_quality CXX_SOURCES bench_CQRRP/CQRRP_pivot_quality.cc LINK_LIBS ${Benchmark_libs}) \ No newline at end of file diff --git a/benchmark/CQRRP_block_per_time.cc b/benchmark/CQRRP_block_per_time.cc deleted file mode 100644 index 6de40ab8..00000000 --- a/benchmark/CQRRP_block_per_time.cc +++ /dev/null @@ -1,150 +0,0 @@ -#include "RandLAPACK.hh" -#include "rl_blaspp.hh" -#include "rl_lapackpp.hh" -#include "rl_gen.hh" - -#include -#include - -template -struct QR_speed_benchmark_data { - int64_t row; - int64_t col; - T tolerance; - T sampling_factor; - std::vector A; - std::vector tau; - std::vector J; - - QR_speed_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : - A(m * n, 0.0), - tau(n, 0.0), - J(n, 0) - { - row = m; - col = n; - tolerance = tol; - sampling_factor = d_factor; - } -}; - -// Re-generate and clear data -template -static void data_regen(RandLAPACK::gen::mat_gen_info m_info, - QR_speed_benchmark_data &all_data, - RandBLAS::RNGState &state, int 
apply_itoa) { - - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); - std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); - if (apply_itoa) { - std::iota(all_data.J.begin(), all_data.J.end(), 1); - } else { - std::fill(all_data.J.begin(), all_data.J.end(), 0); - } -} - -template -static void select_best(std::vector best, std::vector curr) { - for(int i = 0; i < (int) best.size(); ++i) { - if (curr[i] < best[i]) { best[i] = curr[i]; } - } -} - -template -static void call_all_algs( - RandLAPACK::gen::mat_gen_info m_info, - int64_t numruns, - int64_t b_sz, - QR_speed_benchmark_data &all_data, - RandBLAS::RNGState &state, - RandBLAS::RNGState &state_constant) { - - auto m = all_data.row; - auto n = all_data.col; - auto tol = all_data.tolerance; - auto d_factor = all_data.sampling_factor; - - // Additional params setup. - RandLAPACK::CQRRP_blocked CQRRP_blocked(false, tol, b_sz); - CQRRP_blocked.nnz = 2; - CQRRP_blocked.num_threads = 4; - CQRRP_blocked.timing_advanced = 1; - // We are nbot using panel pivoting in performance testing. 
- int panel_pivoting = 0; - - // timing vectors - std::vector best_time_cqrrpt(std::ceil(n / b_sz), 0.0); - std::vector best_time_hqrrp_geqrf(std::ceil(n / b_sz), 0.0); - std::vector best_time_hqrrp_cholqr(std::ceil(n / b_sz), 0.0); - - std::vector time_hqrrp_geqrf(std::ceil(n / b_sz), 0.0); - std::vector time_hqrrp_cholqr(std::ceil(n / b_sz), 0.0); - - - for (int i = 0; i < numruns; ++i) { - printf("ITERATION\n"); - // Testing CQRRP - CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state); - // Update best timing - if (i == 0) { best_time_cqrrpt = CQRRP_blocked.block_per_time; } else { select_best(best_time_cqrrpt, CQRRP_blocked.block_per_time); } - - // Clear and re-generate data - data_regen(m_info, all_data, state_constant, 1); - - // Testing HQRRP with GEQRF - RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 0, state, time_hqrrp_geqrf.data()); - // Update best timing - - if(i == 0) { best_time_hqrrp_geqrf = time_hqrrp_geqrf; } else { select_best(best_time_hqrrp_geqrf, time_hqrrp_geqrf); } - - // Clear and re-generate data - data_regen(m_info, all_data, state_constant, 1); - - // Testing HQRRP with Cholqr - RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 1, state, time_hqrrp_cholqr.data()); - // Update best timing - if(i == 0) { best_time_hqrrp_cholqr = time_hqrrp_cholqr; } else { select_best(best_time_hqrrp_cholqr, time_hqrrp_cholqr);} - - // Clear and re-generate data - data_regen(m_info, all_data, state_constant, 0); - } - - // The actual output may be interpreted as - std::fstream file("QR_block_per_time_raw_rows_" + std::to_string(m) - + "_cols_" + std::to_string(n) - + "_b_sz_" + std::to_string(b_sz) - + "_d_factor_" + std::to_string(d_factor) - + ".dat", std::fstream::app); - - for (int i = 0; i < std::ceil(n / b_sz); ++i) { - file << 
best_time_cqrrpt[i] << " " << best_time_hqrrp_geqrf[i] << " " << best_time_hqrrp_cholqr[i] << "\n"; - } -} - -int main() { - // Declare parameters - int64_t m = std::pow(2, 14); - int64_t n = std::pow(2, 14); - double d_factor = 1.125; - int64_t b_sz_start = 1024; - int64_t b_sz_end = 1024; - double tol = std::pow(std::numeric_limits::epsilon(), 0.85); - auto state = RandBLAS::RNGState(); - auto state_constant = state; - // Timing results - std::vector res; - // Number of algorithm runs. We only record best times. - int64_t numruns = 5; - - // Allocate basic workspace - QR_speed_benchmark_data all_data(m, n, tol, d_factor); - // Generate the input matrix - gaussian suffices for performance tests. - RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::gaussian); - RandLAPACK::gen::mat_gen(m_info, all_data.A, state); - -#if !defined(__APPLE__) - for (;b_sz_start <= b_sz_end; b_sz_start *= 2) { - call_all_algs(m_info, numruns, b_sz_start, all_data, state, state_constant); - } -#endif -} diff --git a/benchmark/CQRRP_accuracy.cc b/benchmark/bench_CQRRP/CQRRP_pivot_quality.cc similarity index 100% rename from benchmark/CQRRP_accuracy.cc rename to benchmark/bench_CQRRP/CQRRP_pivot_quality.cc diff --git a/benchmark/CQRRP_inner_speed.cc b/benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc similarity index 100% rename from benchmark/CQRRP_inner_speed.cc rename to benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc diff --git a/benchmark/CQRRP_speed_comparisons.cc b/benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc similarity index 100% rename from benchmark/CQRRP_speed_comparisons.cc rename to benchmark/bench_CQRRP/CQRRP_speed_comparisons.cc diff --git a/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc new file mode 100644 index 00000000..ac091fbb --- /dev/null +++ b/benchmark/bench_CQRRPT/CQRRPT_pivot_quality.cc @@ -0,0 +1,180 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include 
"rl_gen.hh" + +#include +#include + +template +struct QR_benchmark_data { + int64_t row; + int64_t col; + T tolerance; + T sampling_factor; + std::vector A; + std::vector R; + std::vector tau; + std::vector J; + std::vector S; + + QR_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : + A(m * n, 0.0), + R(n * n, 0.0), + tau(n, 0.0), + J(n, 0), + S(n, 0.0) + { + row = m; + col = n; + tolerance = tol; + sampling_factor = d_factor; + } +}; + +// Re-generate and clear data +template +static void data_regen(RandLAPACK::gen::mat_gen_info m_info, + QR_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + std::fill(all_data.R.begin(), all_data.R.end(), 0.0); + std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); + std::fill(all_data.J.begin(), all_data.J.end(), 0); +} + +// Re-generate and clear data +template +static std::vector get_norms( QR_benchmark_data &all_data) { + + int64_t m = all_data.row; + int64_t n = all_data.col; + + std::vector R_norms (n, 0.0); + for (int i = 0; i < n; ++i) + R_norms[i] = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n - i, n - i, &all_data.A.data()[(m + 1) * i], m); + return R_norms; +} + +template +static void R_norm_ratio( + RandLAPACK::gen::mat_gen_info m_info, + QR_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + auto m = all_data.row; + auto n = all_data.col; + auto tol = all_data.tolerance; + auto d_factor = all_data.sampling_factor; + + // Additional params setup. 
+ RandLAPACK::CQRRPT CQRRPT(true, true, tol); + CQRRPT.nnz = 4; + CQRRPT.num_threads = 48; + + // Running GEQP3 (results are stored under the R_norms_HQRRP name) + lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); + std::vector R_norms_HQRRP = get_norms(all_data); + + // Clear and re-generate data + data_regen(m_info, all_data, state); + + // Running CQRRPT + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state); + std::vector R_norms_CQRRPT = get_norms(all_data); + + // Declare a data file + std::fstream file1("data_out/QR_R_norm_ratios_rows_" + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_d_factor_" + std::to_string(d_factor) + + ".dat", std::fstream::app); + + // Write the 1st metric info into a file. + for (int i = 0; i < n; ++i) + file1 << R_norms_HQRRP[i] / R_norms_CQRRPT[i] << ", "; +} + +template +static void sv_ratio( + RandLAPACK::gen::mat_gen_info m_info, + QR_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + auto m = all_data.row; + auto n = all_data.col; + auto tol = all_data.tolerance; + auto d_factor = all_data.sampling_factor; + std::vector geqp3 (n, 0.0); + std::vector sv_ratios_cqrrp (n, 0.0); + + auto state1 = state; + + // Additional params setup. + RandLAPACK::CQRRPT CQRRPT(true, true, tol); + CQRRPT.nnz = 4; + CQRRPT.num_threads = 48; + + std::fstream file2("data_out/QR_sv_ratios_rows_" + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_d_factor_" + std::to_string(d_factor) + + ".dat", std::fstream::app); + + T* R_dat = all_data.A.data(); + T* S_dat = all_data.S.data(); + + // Running SVD + lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.S.data(), (T*) nullptr, m, (T*) nullptr, n); + + // Clear and re-generate data + data_regen(m_info, all_data, state); + + // Running GEQP3 + std::iota(all_data.J.begin(), all_data.J.end(), 1); + lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); + + // Write the 2nd metric info into a file. 
+ for (int i = 0; i < n; ++i) + file2 << std::abs(R_dat[(m + 1) * i] / S_dat[i]) << ", "; + file2 << ",\n"; + + // Clear and re-generate data + data_regen(m_info, all_data, state1); + + // Running CQRRP + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state); + + // Write the 2nd metric info into a file. + for (int i = 0; i < n; ++i) + file2 << std::abs(R_dat[(m + 1) * i] / S_dat[i]) << ", "; +} + +int main() { + // Declare parameters + int64_t m = std::pow(2, 17); + int64_t n = std::pow(2, 11); + double d_factor = 1.25; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_constant1 = state; + auto state_constant2 = state; + // results + std::vector res1; + std::vector res2; + + // Allocate basic workspace + QR_benchmark_data all_data(m, n, tol, d_factor); + // Generate the input matrix: + // polynomial & step for low coherence; + // spiked for high coherence. + RandLAPACK::gen::mat_gen_info m_info(m, n, RandLAPACK::gen::polynomial); + m_info.cond_num = std::pow(10, 10); + m_info.rank = n; + m_info.exponent = 2.0; + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + + R_norm_ratio(m_info, all_data, state_constant1); + printf("R done\n"); + sv_ratio(m_info, all_data, state_constant2); + printf("SV done\n\n"); +} \ No newline at end of file diff --git a/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc b/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc new file mode 100644 index 00000000..de694a88 --- /dev/null +++ b/benchmark/bench_CQRRPT/CQRRPT_runtime_breakdown.cc @@ -0,0 +1,120 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_gen.hh" + +#include +#include + +template +struct QR_benchmark_data { + int64_t row; + int64_t col; + T tolerance; + T sampling_factor; + std::vector A; + std::vector R; + std::vector tau; + std::vector J; + + QR_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : + A(m * n, 0.0), 
+ R(n * n, 0.0), + tau(n, 0.0), + J(n, 0) + { + row = m; + col = n; + tolerance = tol; + sampling_factor = d_factor; + } +}; + +// Re-generate and clear data +template +static void data_regen(RandLAPACK::gen::mat_gen_info m_info, + QR_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + std::fill(all_data.R.begin(), all_data.R.end(), 0.0); + std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); + std::fill(all_data.J.begin(), all_data.J.end(), 0); +} + +template +static std::vector call_all_algs( + RandLAPACK::gen::mat_gen_info m_info, + int64_t numruns, + int64_t n, + QR_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + auto m = all_data.row; + auto tol = all_data.tolerance; + auto d_factor = all_data.sampling_factor; + + // Additional params setup. + RandLAPACK::CQRRPT CQRRPT(true, true, tol); + CQRRPT.nnz = 4; + CQRRPT.num_threads = 8; + + // Making sure the states are unchanged + auto state_alg = state; + auto state_gen = state; + + // Timing vars + long dur_cqrrpt = 0; + long t_cqrrpt_best = 0; + std::vector inner_timing_best; + + for (int i = 0; i < numruns; ++i) { + printf("Iteration %d start.\n", i); + auto start_cqrrpt = high_resolution_clock::now(); + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); + auto stop_cqrrpt = high_resolution_clock::now(); + dur_cqrrpt = duration_cast(stop_cqrrpt - start_cqrrpt).count(); + // Update best timing + if (!i || dur_cqrrpt < t_cqrrpt_best) {t_cqrrpt_best = dur_cqrrpt; inner_timing_best = CQRRPT.times;} + // Making sure the states are unchanged + state_alg = state; + state_gen = state; + // Clear and re-generate data + data_regen(m_info, all_data, state_gen); + } + + return inner_timing_best; +} + +int main() { + // Declare parameters + int64_t m = std::pow(2, 12); + int64_t n_start = std::pow(2, 5); + int64_t n_stop = std::pow(2, 5); + double d_factor = 1.25; + double tol = 
std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_constant = state; + // Timing results + std::vector res; + // Number of algorithm runs. We only record best times. + int64_t numruns = 25; + + // Allocate basic workspace at its max size. + QR_benchmark_data all_data(m, n_stop, tol, d_factor); + // Generate the input matrix - gaussian suffices for performance tests. + RandLAPACK::gen::mat_gen_info m_info(m, n_stop, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + + // Declare a data file + std::fstream file("CQRRPT_inner_speed_" + std::to_string(m) + + "_col_start_" + std::to_string(n_start) + + "_col_stop_" + std::to_string(n_stop) + + "_d_factor_" + std::to_string(d_factor) + + ".dat", std::fstream::app); + + for (;n_start <= n_stop; n_start *= 2) { + res = call_all_algs(m_info, numruns, n_start, all_data, state_constant); + file << res[0] << ", " << res[1] << ", " << res[2] << ", " << res[3] << ", " << res[4] << ", " << res[5] << ", " << res[6] << ", " << res[7] << ",\n"; + } +} \ No newline at end of file diff --git a/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc new file mode 100644 index 00000000..8791074a --- /dev/null +++ b/benchmark/bench_CQRRPT/CQRRPT_speed_comparisons.cc @@ -0,0 +1,202 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_gen.hh" + +#include +#include + +template +struct QR_benchmark_data { + int64_t row; + int64_t col; + T tolerance; + T sampling_factor; + std::vector A; + std::vector R; + std::vector tau; + std::vector J; + + QR_benchmark_data(int64_t m, int64_t n, T tol, T d_factor) : + A(m * n, 0.0), + R(n * n, 0.0), + tau(n, 0.0), + J(n, 0) + { + row = m; + col = n; + tolerance = tol; + sampling_factor = d_factor; + } +}; + +// Re-generate and clear data +template +static void data_regen(RandLAPACK::gen::mat_gen_info m_info, + QR_benchmark_data 
&all_data, + RandBLAS::RNGState &state) { + + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + std::fill(all_data.R.begin(), all_data.R.end(), 0.0); + std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); + std::fill(all_data.J.begin(), all_data.J.end(), 0); +} + +template +static std::vector call_all_algs( + RandLAPACK::gen::mat_gen_info m_info, + int64_t numruns, + int64_t n, + QR_benchmark_data &all_data, + RandBLAS::RNGState &state) { + + auto m = all_data.row; + auto tol = all_data.tolerance; + auto d_factor = all_data.sampling_factor; + + // Additional params setup. + RandLAPACK::CQRRPT CQRRPT(true, true, tol); + CQRRPT.nnz = 4; + CQRRPT.num_threads = 48; + + // timing vars + long dur_cqrrpt = 0; + long dur_geqp3 = 0; + long dur_geqr = 0; + long dur_geqpt = 0; + long dur_geqrf = 0; + long dur_scholqr = 0; + long t_cqrrpt_best = 0; + long t_geqp3_best = 0; + long t_geqr_best = 0; + long t_geqpt_best = 0; + long t_geqrf_best = 0; + long t_scholqr_best = 0; + + // Making sure the states are unchanged + auto state_gen = state; + auto state_alg = state; + + for (int i = 0; i < numruns; ++i) { + printf("Iteration %d start.\n", i); + // Testing GEQP3 + auto start_geqp3 = high_resolution_clock::now(); + lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); + auto stop_geqp3 = high_resolution_clock::now(); + dur_geqp3 = duration_cast(stop_geqp3 - start_geqp3).count(); + + state_gen = state; + data_regen(m_info, all_data, state_gen); + + // Testing GEQRF + auto start_geqrf = high_resolution_clock::now(); + lapack::geqrf(m, n, all_data.A.data(), m, all_data.tau.data()); + auto stop_geqrf = high_resolution_clock::now(); + dur_geqrf = duration_cast(stop_geqrf - start_geqrf).count(); + + state_gen = state; + data_regen(m_info, all_data, state_gen); + + // Testing CQRRPT + auto start_cqrrp = high_resolution_clock::now(); + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state_alg); + auto 
stop_cqrrp = high_resolution_clock::now(); + dur_cqrrpt = duration_cast(stop_cqrrp - start_cqrrp).count(); + + state_gen = state; + state_alg = state; + data_regen(m_info, all_data, state_gen); + + // Testing SCHOLQR3 + auto start_scholqr = high_resolution_clock::now(); + //--------------------------------------------------------------------------------------------------------------------------// + T norm_A = lapack::lange(Norm::Fro, m, n, all_data.A.data(), m); + T shift = 11 * std::numeric_limits::epsilon() * n * std::pow(norm_A, 2); + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); + for (int i = 0; i < n; ++i) + all_data.R[i * (n + 1)] += shift; + lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); + // CholeskyQR2 + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); + lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); + // CholeskyQR3 + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, n, m, 1.0, all_data.A.data(), m, 0.0, all_data.R.data(), n); + lapack::potrf(Uplo::Upper, n, all_data.R.data(), n); + blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, m, n, 1.0, all_data.R.data(), n, all_data.A.data(), m); + //--------------------------------------------------------------------------------------------------------------------------// + auto stop_scholqr = high_resolution_clock::now(); + dur_scholqr = duration_cast(stop_scholqr - start_scholqr).count(); + + auto state_gen = state; + data_regen(m_info, all_data, state_gen); + + // Testing GEQR + GEQPT + auto start_geqpt = high_resolution_clock::now(); + auto start_geqr = 
high_resolution_clock::now(); +#if !defined(__APPLE__) + // GEQR(A) part + lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), -1); + int64_t tsize = (int64_t) all_data.tau[0]; + all_data.tau.resize(tsize); + lapack::geqr(m, n, all_data.A.data(), m, all_data.tau.data(), tsize); +#endif + auto stop_geqr = high_resolution_clock::now(); + dur_geqr = duration_cast(stop_geqr - start_geqr).count(); +#if !defined(__APPLE__) + // GEQP3(R) part + lapack::lacpy(MatrixType::Upper, n, n, all_data.A.data(), m, all_data.R.data(), n); + lapack::geqp3(n, n, all_data.R.data(), n, all_data.J.data(), all_data.tau.data()); +#endif + auto stop_geqpt = high_resolution_clock::now(); + dur_geqpt = duration_cast(stop_geqpt - start_geqpt).count(); + + state_gen = state; + data_regen(m_info, all_data, state_gen); + + i == 0 ? t_cqrrpt_best = dur_cqrrpt : (dur_cqrrpt < t_cqrrpt_best) ? t_cqrrpt_best = dur_cqrrpt : NULL; + i == 0 ? t_geqpt_best = dur_geqpt : (dur_geqpt < t_geqpt_best) ? t_geqpt_best = dur_geqpt : NULL; + i == 0 ? t_geqrf_best = dur_geqrf : (dur_geqrf < t_geqrf_best) ? t_geqrf_best = dur_geqrf : NULL; + i == 0 ? t_geqr_best = dur_geqr : (dur_geqr < t_geqr_best) ? t_geqr_best = dur_geqr : NULL; + i == 0 ? t_geqp3_best = dur_geqp3 : (dur_geqp3 < t_geqp3_best) ? t_geqp3_best = dur_geqp3 : NULL; + i == 0 ? t_scholqr_best = dur_scholqr : (dur_scholqr < t_scholqr_best) ? t_scholqr_best = dur_scholqr : NULL; + } + + std::vector res{t_cqrrpt_best, t_geqpt_best, t_geqrf_best, t_geqr_best, t_geqp3_best, t_scholqr_best}; + + return res; +} + +int main() { + // Declare parameters + int64_t m = std::pow(2, 17); + int64_t n_start = std::pow(2, 9); + int64_t n_stop = std::pow(2, 13); + double d_factor = 1.25; + double tol = std::pow(std::numeric_limits::epsilon(), 0.85); + auto state = RandBLAS::RNGState(); + auto state_constant = state; + // Timing results + std::vector res; + // Number of algorithm runs. We only record best times. 
+ int64_t numruns = 1; + + // Allocate basic workspace + QR_benchmark_data all_data(m, n_stop, tol, d_factor); + // Generate the input matrix - gaussian suffices for performance tests. + RandLAPACK::gen::mat_gen_info m_info(m, n_stop, RandLAPACK::gen::gaussian); + RandLAPACK::gen::mat_gen(m_info, all_data.A, state); + + // Declare a data file + std::fstream file("CQRRPT_speed_comp_" + std::to_string(m) + + "_col_start_" + std::to_string(n_start) + + "_col_stop_" + std::to_string(n_stop) + + "_d_factor_" + std::to_string(d_factor) + + ".dat", std::fstream::app); + + for (;n_start <= n_stop; n_start *= 2) { + res = call_all_algs(m_info, numruns, n_start, all_data, state_constant); + file << res[0] << ", " << res[1] << ", " << res[2] << ", " << res[3] << ", " << res[4] << ", " << res[5] << ",\n"; + } +} \ No newline at end of file diff --git a/benchmark/Chol_check.cc b/benchmark/bench_general/Chol_check.cc similarity index 100% rename from benchmark/Chol_check.cc rename to benchmark/bench_general/Chol_check.cc diff --git a/benchmark/GEMM_flop_count.cc b/benchmark/bench_general/GEMM_flop_count.cc similarity index 100% rename from benchmark/GEMM_flop_count.cc rename to benchmark/bench_general/GEMM_flop_count.cc diff --git a/benchmark/convert_time.cc b/benchmark/bench_general/convert_time.cc similarity index 100% rename from benchmark/convert_time.cc rename to benchmark/bench_general/convert_time.cc diff --git a/test/drivers/test_cqrrpt.cc b/test/drivers/test_cqrrpt.cc index 1f5551fc..9a130555 100644 --- a/test/drivers/test_cqrrpt.cc +++ b/test/drivers/test_cqrrpt.cc @@ -30,6 +30,7 @@ class TestCQRRPT : public ::testing::Test CQRRPTTestData(int64_t m, int64_t n, int64_t k) : A(m * n, 0.0), + R(n * n, 0.0), J(n, 0), A_cpy1(m * n, 0.0), A_cpy2(m * n, 0.0), @@ -76,7 +77,7 @@ class TestCQRRPT : public ::testing::Test T norm_0 = lapack::lansy(lapack::Norm::Fro, Uplo::Upper, k, I_ref_dat, k); // A - QR - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, 
Q_dat, m, R_dat, k, -1.0, A_dat, m); + blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, n, k, 1.0, Q_dat, m, R_dat, n, -1.0, A_dat, m); // Implementing max col norm metric T max_col_norm = 0.0; @@ -106,7 +107,7 @@ class TestCQRRPT : public ::testing::Test /// Computes QR factorzation, and computes A[:, J] - QR. template static void test_CQRRPT_general( - int64_t d, + T d_factor, T norm_A, CQRRPTTestData &all_data, alg_type &CQRRPT, @@ -115,7 +116,7 @@ class TestCQRRPT : public ::testing::Test auto m = all_data.row; auto n = all_data.col; - CQRRPT.call(m, n, all_data.A, d, all_data.R, all_data.J, state); + CQRRPT.call(m, n, all_data.A.data(), m, all_data.R.data(), n, all_data.J.data(), d_factor, state); all_data.rank = CQRRPT.rank; printf("RANK AS RETURNED BY CQRRPT %ld\n", all_data.rank); @@ -124,7 +125,6 @@ class TestCQRRPT : public ::testing::Test RandLAPACK::util::col_swap(m, n, n, all_data.A_cpy2.data(), m, all_data.J); error_check(norm_A, all_data); - } }; @@ -133,7 +133,7 @@ TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) { int64_t m = 10000; int64_t n = 200; int64_t k = 200; - int64_t d = 400; + double d_factor = 2; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); @@ -151,14 +151,14 @@ TEST_F(TestCQRRPT, CQRRPT_full_rank_no_hqrrp) { RandLAPACK::gen::mat_gen(m_info, all_data.A, state); norm_and_copy_computational_helper(norm_A, all_data); - test_CQRRPT_general>(d, norm_A, all_data, CQRRPT, state); + test_CQRRPT_general>(d_factor, norm_A, all_data, CQRRPT, state); } TEST_F(TestCQRRPT, CQRRPT_low_rank_with_hqrrp) { int64_t m = 10000; int64_t n = 200; int64_t k = 100; - int64_t d = 400; + double d_factor = 2; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); @@ -176,7 +176,7 @@ TEST_F(TestCQRRPT, CQRRPT_low_rank_with_hqrrp) { RandLAPACK::gen::mat_gen(m_info, all_data.A, state); norm_and_copy_computational_helper(norm_A, 
all_data); - test_CQRRPT_general>(d, norm_A, all_data, CQRRPT, state); + test_CQRRPT_general>(d_factor, norm_A, all_data, CQRRPT, state); } // Using L2 norm rank estimation here is similar to using raive estimation. @@ -185,7 +185,7 @@ TEST_F(TestCQRRPT, CQRRPT_bad_orth) { int64_t m = 10e4; int64_t n = 300; int64_t k = 0; - int64_t d = 300; + double d_factor = 1; double norm_A = 0; double tol = std::pow(std::numeric_limits::epsilon(), 0.75); auto state = RandBLAS::RNGState(); @@ -201,7 +201,7 @@ TEST_F(TestCQRRPT, CQRRPT_bad_orth) { RandLAPACK::gen::mat_gen(m_info, all_data.A, state); norm_and_copy_computational_helper(norm_A, all_data); - test_CQRRPT_general>(d, norm_A, all_data, CQRRPT, state); + test_CQRRPT_general>(d_factor, norm_A, all_data, CQRRPT, state); }