From 1089aa4f71e6ce2fd45b8edbcb15c1b67d4f154b Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 12 Feb 2024 07:48:35 -0800 Subject: [PATCH] Reworking of RBKI speed benchmark. --- RandLAPACK/misc/rl_gen.hh | 2 +- .../bench_RBKI/RBKI_speed_comparisons.cc | 190 +++++------------- 2 files changed, 47 insertions(+), 145 deletions(-) diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 4f866b08..2490782b 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -420,7 +420,7 @@ void gen_kahan_mat( free(C); } -/// Generates Kahan matrix +/// Reads a matrix from a file template void process_input_mat( int64_t &m, diff --git a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc index 9cf9150f..f41daf2c 100644 --- a/benchmark/bench_RBKI/RBKI_speed_comparisons.cc +++ b/benchmark/bench_RBKI/RBKI_speed_comparisons.cc @@ -10,28 +10,24 @@ template struct RBKI_benchmark_data { int64_t row; int64_t col; - int64_t rank; // has to be modifiable T tolerance; std::vector A; std::vector U; std::vector V; std::vector Sigma; std::vector Sigma_cpy_RBKI; - std::vector Sigma_cpy_SVD; - std::vector Sigma_cpy_Other; + std::vector Sigma_SVD; - RBKI_benchmark_data(int64_t m, int64_t n, int64_t k, T tol) : + RBKI_benchmark_data(int64_t m, int64_t n, T tol) : A(m * n, 0.0), U(m * n, 0.0), V(n * n, 0.0), Sigma(n, 0.0), Sigma_cpy_RBKI(n, 0.0), - Sigma_cpy_SVD(n, 0.0), - Sigma_cpy_Other(n, 0.0) + Sigma_SVD(n, 0.0) { row = m; col = n; - rank = k; tolerance = tol; } }; @@ -50,172 +46,76 @@ static void data_regen(RandLAPACK::gen::mat_gen_info m_info, } template -static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, int64_t k, long* break_in, long* break_out, int timing) +static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2, int64_t target_rank) { if (iter == 0 || t_curr < t_best) { t_best = t_curr; - blas::copy(k, S1, 1, S2, 1); + blas::copy(target_rank, S1, 1, S2, 1); } - if (timing) - blas::copy(13, break_out, 1, break_in, 1); } -template -static long run_svd( - RandLAPACK::gen::mat_gen_info m_info, - RBKI_benchmark_data &all_data, - RandBLAS::RNGState &state) -{ - auto m = all_data.row; - auto n = all_data.col; - auto tol = all_data.tolerance; - - // Using this call for BLAS/LAPACK warmup - lapack::gesdd(Job::NoVec, 10, 10, all_data.A.data(), 10, all_data.Sigma.data(), all_data.U.data(), 10, all_data.V.data(), 10); - auto state_gen = state; - data_regen(m_info, all_data, state_gen, 1); - - // Testing Other - SVD - auto start_svd = high_resolution_clock::now(); - lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.Sigma.data(), all_data.U.data(), m, all_data.V.data(), n); - auto stop_svd = high_resolution_clock::now(); - long dur_svd = duration_cast(stop_svd - start_svd).count(); - - blas::copy(n, all_data.Sigma.data(), 1, all_data.Sigma_cpy_SVD.data(), 1); - - state_gen = state; - data_regen(m_info, all_data, state_gen, 1); - - return dur_svd; -} - - template static void call_all_algs( RandLAPACK::gen::mat_gen_info m_info, int64_t numruns, - int64_t k, - int64_t num_krylov_iters, + int64_t b_sz, + int64_t target_rank, RBKI_benchmark_data &all_data, RandBLAS::RNGState &state, std::string output_filename, long dur_svd) { + printf("\nBlock size %ld, target rank %ld\n", b_sz, target_rank); int i, j; auto m = all_data.row; auto n = all_data.col; auto tol = all_data.tolerance; T norm_svd_k; - T norm_svd_lanc; T err_rbki; - T err_lan; - int64_t k_lanc = std::min((int64_t) (num_krylov_iters / (T) 2), k); - bool time_subroutines = true; - - // Set the threshold for Lanchosz - // Setting up Lanchosz - RBKI with k = 1. - RandLAPACK::RBKI Lanchosz(false, false, tol); - Lanchosz.max_krylov_iters = num_krylov_iters; + bool time_subroutines = false; // Additional params setup. RandLAPACK::RBKI RBKI(false, time_subroutines, tol); - RBKI.max_krylov_iters = num_krylov_iters; + // Matrices R or S that give us the singular value spectrum returned by RBKI will be of size b_sz * num_krylov_iters / 2. + // These matrices will be full-rank. + // Hence, target_rank = b_sz * num_krylov_iters / 2 + RBKI.max_krylov_iters = (int) ((target_rank * 2) / b_sz); // timing vars - long dur_rbki = 0; - long dur_lanchosz = 0; - long t_rbki_best = 0; - long t_lanchosz_best = 0; + long dur_rbki = 0; + long t_rbki_best = 0; // Making sure the states are unchanged auto state_gen = state; - //auto state_alg = state; - - // Timing breakdown vectors; - std::vector Lanc_timing_breakdown (13, 0.0); - std::vector RBKI_timing_breakdown (13, 0.0); for (i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); - - // Testing Lanchosz - auto start_lanchosz = high_resolution_clock::now(); - Lanchosz.call(m, n, all_data.A.data(), m, 1, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); - auto stop_lanchosz = high_resolution_clock::now(); - dur_lanchosz = duration_cast(stop_lanchosz - start_lanchosz).count(); - - // Update best timing and save the singular values. - update_best_time(i, t_lanchosz_best, dur_lanchosz, all_data.Sigma.data(), all_data.Sigma_cpy_Other.data(), k_lanc, Lanc_timing_breakdown.data(), Lanchosz.times.data(), false); - - state_gen = state; - data_regen(m_info, all_data, state_gen, 0); // Testing RBKI auto start_rbki = high_resolution_clock::now(); - RBKI.call(m, n, all_data.A.data(), m, k, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); - + RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state); auto stop_rbki = high_resolution_clock::now(); dur_rbki = duration_cast(stop_rbki - start_rbki).count(); // Update best timing and save the singular values. - update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_RBKI.data(), k, RBKI_timing_breakdown.data(), RBKI.times.data(), time_subroutines); + update_best_time(i, t_rbki_best, dur_rbki, all_data.Sigma.data(), all_data.Sigma_cpy_RBKI.data(), target_rank); state_gen = state; data_regen(m_info, all_data, state_gen, 0); } - for(j = 0; j < k; ++j) - all_data.Sigma_cpy_RBKI[j] -= all_data.Sigma_cpy_SVD[j]; + for(j = 0; j < target_rank; ++j) + all_data.Sigma_cpy_RBKI[j] -= all_data.Sigma_SVD[j]; - for(j = 0; j < k_lanc; ++j) - all_data.Sigma_cpy_Other[j] -= all_data.Sigma_cpy_SVD[j]; - - norm_svd_k = blas::nrm2(k, all_data.Sigma_cpy_SVD.data(), 1); - norm_svd_lanc = blas::nrm2(k_lanc, all_data.Sigma_cpy_SVD.data(), 1); - - err_rbki = blas::nrm2(k, all_data.Sigma_cpy_RBKI.data(), 1) / norm_svd_k; - err_lan = blas::nrm2(k_lanc, all_data.Sigma_cpy_Other.data(), 1) / norm_svd_lanc; - - if (time_subroutines) { - printf("\n\n/------------RBKI TIMING RESULTS BEGIN------------/\n"); - printf("Basic info: b_sz=%ld krylov_iters=%ld\n", k, num_krylov_iters); - - printf("Allocate and free time: %25ld μs,\n", RBKI_timing_breakdown[0]); - printf("Time to acquire the SVD factors: %25ld μs,\n", RBKI_timing_breakdown[1]); - printf("UNGQR time: %25ld μs,\n", RBKI_timing_breakdown[2]); - printf("Reorthogonalization time: %25ld μs,\n", RBKI_timing_breakdown[3]); - printf("QR time: %25ld μs,\n", RBKI_timing_breakdown[4]); - printf("GEMM A time: %25ld μs,\n", RBKI_timing_breakdown[5]); - printf("Sketching time: %25ld μs,\n", RBKI_timing_breakdown[7]); - printf("R_ii cpy time: %25ld μs,\n", RBKI_timing_breakdown[8]); - printf("S_ii cpy time: %25ld μs,\n", RBKI_timing_breakdown[9]); - printf("Norm time: %25ld μs,\n", RBKI_timing_breakdown[10]); - - printf("\nAllocation takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[0] / (T) RBKI_timing_breakdown[12])); - printf("Factors takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[1] / (T) RBKI_timing_breakdown[12])); - printf("Ungqr takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[2] / (T) RBKI_timing_breakdown[12])); - printf("Reorth takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[3] / (T) RBKI_timing_breakdown[12])); - printf("QR takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[4] / (T) RBKI_timing_breakdown[12])); - printf("GEMM A takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[5] / (T) RBKI_timing_breakdown[12])); - printf("Sketching takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[7] / (T) RBKI_timing_breakdown[12])); - printf("R_ii cpy takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[8] / (T) RBKI_timing_breakdown[12])); - printf("S_ii cpy takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[9] / (T) RBKI_timing_breakdown[12])); - printf("Norm R takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[10] / (T) RBKI_timing_breakdown[12])); - printf("Rest takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[11] / (T) RBKI_timing_breakdown[12])); - - printf("\nMain loop takes %22.2f%% of runtime.\n", 100 * ((T) RBKI_timing_breakdown[6] / (T) RBKI_timing_breakdown[12])); - printf("/-------------RBKI TIMING RESULTS END-------------/\n\n"); - } + norm_svd_k = blas::nrm2(target_rank, all_data.Sigma_SVD.data(), 1); + err_rbki = blas::nrm2(target_rank, all_data.Sigma_cpy_RBKI.data(), 1) / norm_svd_k; // Print accuracy info printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki); - printf("||Sigma_ksvd - Sigma_lanc||_F / ||Sigma_lanc||_F: %.16e\n", err_lan); - - printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best); - printf("Lanchosz is %f times faster that SVD.\n", (T) dur_svd / t_lanchosz_best); + printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best); std::ofstream file(output_filename, std::ios::app); - file << k << ", " << num_krylov_iters << ", " << err_rbki << ", " << err_lan << ", " << t_rbki_best << ", " << dur_svd << ", " << t_lanchosz_best << ",\n"; + file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << err_rbki << ", " << t_rbki_best << ", " << dur_svd << ",\n"; } int main(int argc, char *argv[]) { @@ -229,15 +129,16 @@ int main(int argc, char *argv[]) { int64_t m = 0; int64_t n = 0; - int64_t k_start = 0; - int64_t k_stop = 0; - int64_t num_krylov_iters_start = 2; - int64_t num_krylov_iters_curr = num_krylov_iters_start; - int64_t num_krylov_iters_stop = 64; + int64_t b_sz_start = 0; + int64_t b_sz_stop = 0; + int64_t target_rank_start = 256; + int64_t target_rank_curr = target_rank_start; + int64_t target_rank_stop = 4096; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; - int numruns = 10; + int numruns = 5; + long dur_svd = 0; std::vector res; // Generate the input matrix. @@ -250,33 +151,34 @@ int main(int argc, char *argv[]) { // Update basic params. m = m_info.rows; n = m_info.cols; - k_start = 2;//std::max((int64_t) 1, n / 10); - k_stop = 256;//std::max((int64_t) 1, n / 10); + b_sz_start = 2;//std::max((int64_t) 1, n / 10); + b_sz_stop = 128;//std::max((int64_t) 1, n / 10); // Allocate basic workspace. - RBKI_benchmark_data all_data(m, n, k_stop, tol); + RBKI_benchmark_data all_data(m, n, tol); // Fill the data matrix; RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); + // Read the singular vectors from argv2 + int64_t buf1 = 1; + int buf2 = 0; + RandLAPACK::gen::process_input_mat(m, buf1, all_data.Sigma_SVD.data(), argv[2], buf2); printf("Finished data preparation\n"); // Declare a data file - std::string output_filename = "RBKI_speed_comp_m_" + std::to_string(m) + std::string output_filename = "RBKI_speed_comp_m_" + std::to_string(m) + "_n_" + std::to_string(n) - + "_k_start_" + std::to_string(k_start) - + "_k_stop_" + std::to_string(k_stop) - + "_num_krylov_iters_start_" + std::to_string(num_krylov_iters_start) - + "_num_krylov_iters_stop_" + std::to_string(num_krylov_iters_stop) + + "_b_sz_start_" + std::to_string(b_sz_start) + + "_b_sz_stop_" + std::to_string(b_sz_stop) + + "_num_krylov_iters_start_" + std::to_string(target_rank_start) + + "_num_krylov_iters_stop_" + std::to_string(target_rank_stop) + ".dat"; - // SVD run takes very long & is only needed once for all sizes - long dur_svd = run_svd(m_info, all_data, state); - - for (;k_start <= k_stop; k_start *=2) { - for (;num_krylov_iters_curr <= num_krylov_iters_stop; num_krylov_iters_curr *=2) { - call_all_algs(m_info, numruns, k_start, num_krylov_iters_curr, all_data, state_constant, output_filename, dur_svd); + for (;b_sz_start <= b_sz_stop; b_sz_start *=2) { + for (;target_rank_curr <= target_rank_stop; target_rank_curr *=2) { + call_all_algs(m_info, numruns, b_sz_start, target_rank_curr, all_data, state_constant, output_filename, dur_svd); } - num_krylov_iters_curr = num_krylov_iters_start; + target_rank_curr = target_rank_start; } } \ No newline at end of file