Skip to content

Commit

Permalink
RBKI benchmark update, print statements in
Browse files Browse the repository at this point in the history
  • Loading branch information
TeachRaccooon committed Feb 28, 2024
1 parent 28ee6ca commit d8a1fd2
Show file tree
Hide file tree
Showing 2 changed files with 122 additions and 42 deletions.
50 changes: 39 additions & 11 deletions RandLAPACK/drivers/rl_rbki.hh
Original file line number Diff line number Diff line change
Expand Up @@ -127,7 +127,7 @@ int RBKI<T, RNG>::call(
int64_t iter = 0, iter_od = 0, iter_ev = 0, i = 0, end_rows = 0, end_cols = 0;
T norm_R = 0;
int64_t space_rows = k * std::ceil(m / (T) k);
int max_iters = std::min(this->max_krylov_iters, (int) (n / (T) k));
int max_iters = this->max_krylov_iters;//std::min(this->max_krylov_iters, (int) (n / (T) k));

// We need a full copy of X and Y all the way through the algorithm
// due to an operation with X_odd and Y_odd happening at the end.
Expand All @@ -140,7 +140,12 @@ int RBKI<T, RNG>::call(
// While R and S matrices are structured (both band), we cannot make use of this structure through
// BLAS-level functions.
// Note also that we store a transposed version of R.
T* R = ( T * ) calloc( n * n, sizeof( T ) );
//
// At each iteration, matrices R and S grow by b_sz.
// At the end, size of R would be d x d and size of S would
// be (d + 1) x d, where d = numiters_complete * b_sz, d <= n.
// Note that the total amount of iterations will always be numiters <= n * 2 / block_size
T* R = ( T * ) calloc( n * n, sizeof( T ) );
T* S = ( T * ) calloc( (n + k) * n, sizeof( T ) );

T* Y_orth_buf = ( T * ) calloc( k * n, sizeof( T ) );
Expand Down Expand Up @@ -177,16 +182,19 @@ int RBKI<T, RNG>::call(

// Generate a dense Gaussian random matrix.
// OMP_NUM_THREADS=4 seems to be the best option for dense sketch generation.
omp_set_num_threads(4);
//omp_set_num_threads(4);
RandBLAS::DenseDist D(n, k);
state = RandBLAS::fill_dense(D, Y_i, state).second;
omp_set_num_threads(48);
//omp_set_num_threads(48);

if(this -> timing) {
sketching_t_stop = high_resolution_clock::now();
sketching_t_dur = duration_cast<microseconds>(sketching_t_stop - sketching_t_start).count();
gemm_A_t_start = high_resolution_clock::now();
}
printf("m %d, n %d, k %d\n", m, n, k);
char name[] = "A";
//RandBLAS::util::print_colmaj(n, k, Y_i, name);

// [X_ev, ~] = qr(A * Y_i, 0)
blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, n, 1.0, A, m, Y_i, n, 0.0, X_i, m);
Expand Down Expand Up @@ -222,12 +230,12 @@ int RBKI<T, RNG>::call(

// Iterate until in-loop termination criteria is met.

while((iter_ev + iter_od) < max_iters) {
while(1) {
if(this -> timing)
main_loop_t_start = high_resolution_clock::now();

if (iter % 2 != 0) {

printf("First\n");
if(this -> timing)
gemm_A_t_start = high_resolution_clock::now();

Expand Down Expand Up @@ -299,7 +307,7 @@ int RBKI<T, RNG>::call(
// Early termination
// if (abs(R(end)) <= sqrt(eps('double')))
if(std::abs(R_ii[(n + 1) * (k - 1)]) < std::sqrt(std::numeric_limits<double>::epsilon())) {
//printf("TERMINATION 1 at iteration %ld\n", iter_ev);
printf("TERMINATION 1 at iteration %ld\n", iter);
break;
}

Expand All @@ -311,7 +319,7 @@ int RBKI<T, RNG>::call(
++iter_ev;
}
else {

printf("Second\n");
if(this -> timing)
gemm_A_t_start = high_resolution_clock::now();

Expand Down Expand Up @@ -378,7 +386,7 @@ int RBKI<T, RNG>::call(
// Early termination
// if (abs(S(end)) <= sqrt(eps('double')))
if(std::abs(S_ii[((n + k) + 1) * (k - 1)]) < std::sqrt(std::numeric_limits<double>::epsilon())) {
//printf("TERMINATION 2 at iteration %ld\n", iter_od);
printf("TERMINATION 2 at iteration %ld\n", iter);
break;
}

Expand All @@ -403,17 +411,26 @@ int RBKI<T, RNG>::call(
main_loop_t_dur += duration_cast<microseconds>(main_loop_t_stop - main_loop_t_start).count();
}

if (iter >= max_iters) {
break;
}

++iter;
//norm(R, 'fro') > sqrt(1 - sq_tol) * norm_A
if(norm_R > threshold) {
printf("Threshold termination\n");
break;
}
printf("Iter_ev + iter_od %d\n", iter_ev + iter_od);
}
printf("Total iters %d\n", iter);

this -> norm_R_end = norm_R;
this->num_krylov_iters = iter;
iter % 2 == 0 ? end_rows = k * (iter_ev + 1), end_cols = k * iter_ev : end_rows = k * (iter_od + 1), end_cols = k * iter_od;

printf("End rows & cols %d, %d\n", end_rows, end_cols);

if(this -> timing) {
allocation_t_start = high_resolution_clock::now();
}
Expand All @@ -427,12 +444,24 @@ int RBKI<T, RNG>::call(
get_factors_t_start = high_resolution_clock::now();
}

if (iter % 2 == 0) {
if (iter % 2 != 0) {
printf("First option\n");
// [U_hat, Sigma, V_hat] = svd(R')
lapack::gesdd(Job::SomeVec, end_rows, end_cols, R, n, Sigma, U_hat, end_rows, VT_hat, end_cols);
} else {
printf("Second option\n");
// [U_hat, Sigma, V_hat] = svd(S)
lapack::gesdd(Job::SomeVec, end_rows, end_cols, S, n + k, Sigma, U_hat, end_rows, VT_hat, end_cols);
/*
char name1[] = "U_hat";
RandBLAS::util::print_colmaj(end_rows, end_cols, U_hat, name1);
char name3[] = "Sigma";
RandBLAS::util::print_colmaj(n, 1, Sigma, name3);
char name2[] = "V_hat";
RandBLAS::util::print_colmaj(end_cols, end_cols, VT_hat, name2);
*/
}
// U = X_ev * U_hat
blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, end_cols, end_rows, 1.0, X_ev, m, U_hat, end_rows, 0.0, U, m);
Expand Down Expand Up @@ -499,7 +528,6 @@ int RBKI<T, RNG>::call(
printf("/-------------RBKI TIMING RESULTS END-------------/\n\n");
}
}

return 0;
}
} // end namespace RandLAPACK
Expand Down
114 changes: 83 additions & 31 deletions benchmark/bench_RBKI/RBKI_speed_comparisons.cc
Original file line number Diff line number Diff line change
Expand Up @@ -13,18 +13,16 @@ struct RBKI_benchmark_data {
T tolerance;
std::vector<T> A;
std::vector<T> U;
std::vector<T> V;
std::vector<T> VT; // RBKI returns V'
std::vector<T> Sigma;
std::vector<T> Sigma_cpy_RBKI;
std::vector<T> Sigma_SVD;
std::vector<T> U_cpy;
std::vector<T> VT_cpy;

RBKI_benchmark_data(int64_t m, int64_t n, T tol) :
A(m * n, 0.0),
U(m * n, 0.0),
V(n * n, 0.0),
Sigma(n, 0.0),
Sigma_cpy_RBKI(n, 0.0),
Sigma_SVD(n, 0.0)
VT(n * n, 0.0),
Sigma(n, 0.0)
{
row = m;
col = n;
Expand All @@ -41,7 +39,7 @@ static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info,
if (overwrite_A)
RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A.data(), state);
std::fill(all_data.U.begin(), all_data.U.end(), 0.0);
std::fill(all_data.V.begin(), all_data.V.end(), 0.0);
std::fill(all_data.VT.begin(), all_data.VT.end(), 0.0);
std::fill(all_data.Sigma.begin(), all_data.Sigma.end(), 0.0);
}

Expand All @@ -54,22 +52,72 @@ static void update_best_time(int iter, long &t_best, long &t_curr, T* S1, T* S2,
}
}

// Computes the scaled residual error of the leading `custom_rank` singular
// triplets stored in `all_data`:
//
//     sqrt(||A V - U S||_F^2 + ||A' U - V S||_F^2) / sqrt(custom_rank)
//
// where U = U(:, 1:custom_rank), S = diag(Sigma(1:custom_rank)) and
// V' = VT(1:custom_rank, :). One of the two terms vanishes in exact arithmetic.
//
// Parameters:
//   all_data    - benchmark workspace; A, U, VT and Sigma are read, U_cpy and
//                 VT_cpy are (re)sized and overwritten with the two residuals.
//   target_rank - number of triplets held in U / VT / Sigma; only used to size
//                 the U workspace.
//   custom_rank - number of leading triplets to evaluate;
//                 1 <= custom_rank <= target_rank (it is used as a divisor).
//
// Returns the combined residual norm described above.
//
// NOTE(review): VT is accessed as a column-major matrix with leading
// dimension n holding V' in its leading rows, exactly as the first gemm call
// below has always assumed - confirm against the RBKI driver's output layout.
template <typename T>
static T
residual_error_comp(RBKI_benchmark_data<T> &all_data, int64_t target_rank, int64_t custom_rank) {

    auto m = all_data.row;
    auto n = all_data.col;

    // U_cpy holds an m x custom_rank block. VT_cpy holds a custom_rank x n
    // block stored with leading dimension n, so it needs room for up to
    // n * n entries (the last touched index is (n - 1) * n + custom_rank - 1).
    T* U_cpy_dat  = RandLAPACK::util::upsize(m * target_rank, all_data.U_cpy);
    T* VT_cpy_dat = RandLAPACK::util::upsize(n * n, all_data.VT_cpy);

    lapack::lacpy(MatrixType::General, m, custom_rank, all_data.U.data(), m, U_cpy_dat, m);
    lapack::lacpy(MatrixType::General, custom_rank, n, all_data.VT.data(), n, VT_cpy_dat, n);

    // ---- A V - U S ----
    // Scale each of the leading columns of U (length m) by its singular value.
    for (int64_t i = 0; i < custom_rank; ++i)
        blas::scal(m, all_data.Sigma[i], &U_cpy_dat[m * i], 1);

    // U_cpy <- A * VT(1:custom_rank, :)' - U_cpy
    blas::gemm(Layout::ColMajor, Op::NoTrans, Op::Trans, m, custom_rank, n, 1.0, all_data.A.data(), m, all_data.VT.data(), n, -1.0, U_cpy_dat, m);

    // ---- A' U - V S, evaluated in transposed form as U' A - S V' ----
    // Scaling columns of V is scaling rows of VT; row i starts at offset i
    // and its n entries are strided by the leading dimension n.
    for (int64_t i = 0; i < custom_rank; ++i)
        blas::scal(n, all_data.Sigma[i], &VT_cpy_dat[i], n);

    // VT_cpy <- U(:, 1:custom_rank)' * A - VT_cpy
    blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, custom_rank, n, m, 1.0, all_data.U.data(), m, all_data.A.data(), m, -1.0, VT_cpy_dat, n);

    const T scale = std::sqrt(static_cast<T>(custom_rank));
    const T nrm1 = lapack::lange(Norm::Fro, m, custom_rank, U_cpy_dat, m) / scale;
    const T nrm2 = lapack::lange(Norm::Fro, custom_rank, n, VT_cpy_dat, n) / scale;

    // Combine both residual parts; hypot avoids the intermediate squaring.
    return std::hypot(nrm1, nrm2);
}

template <typename T, typename RNG>
static void call_all_algs(
RandLAPACK::gen::mat_gen_info<T> m_info,
int64_t numruns,
int64_t b_sz,
int64_t target_rank,
int64_t custom_rank,
RBKI_benchmark_data<T> &all_data,
RandBLAS::RNGState<RNG> &state,
std::string output_filename,
long dur_svd) {
printf("\nBlock size %ld, target rank %ld\n", b_sz, target_rank);

int i, j;
auto m = all_data.row;
auto n = all_data.col;
auto tol = all_data.tolerance;
auto m = all_data.row;
auto n = all_data.col;
auto tol = all_data.tolerance;
T norm_svd_k;
T err_rbki;
bool time_subroutines = false;
Expand All @@ -80,6 +128,7 @@ static void call_all_algs(
// These matrices will be full-rank.
// Hence, target_rank = b_sz * num_krylov_iters / 2
RBKI.max_krylov_iters = (int) ((target_rank * 2) / b_sz);
printf("Max Krylov iters %d\n", RBKI.max_krylov_iters);

// timing vars
long dur_rbki = 0;
Expand All @@ -88,29 +137,35 @@ static void call_all_algs(
// Making sure the states are unchanged
auto state_gen = state;

// Pre-compute the 2-norm of the Sigma vector from Direct SVD
norm_svd_k = blas::nrm2(target_rank, all_data.Sigma_SVD.data(), 1);

for (i = 0; i < numruns; ++i) {
printf("Iteration %d start.\n", i);

// Testing RBKI
auto start_rbki = high_resolution_clock::now();
RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.V.data(), all_data.Sigma.data(), state);
RBKI.call(m, n, all_data.A.data(), m, b_sz, all_data.U.data(), all_data.VT.data(), all_data.Sigma.data(), state);
auto stop_rbki = high_resolution_clock::now();
dur_rbki = duration_cast<microseconds>(stop_rbki - start_rbki).count();

char name[] = "A";
//RandBLAS::util::print_colmaj(m, n, all_data.A.data(), name);

char name1[] = "U";
//RandBLAS::util::print_colmaj(m, target_rank, all_data.U.data(), name1);

char name3[] = "Sigma";
//RandBLAS::util::print_colmaj(target_rank, 1, all_data.Sigma.data(), name3);

char name2[] = "VT";
//RandBLAS::util::print_colmaj(n, n, all_data.VT.data(), name2);

for(j = 0; j < target_rank; ++j)
all_data.Sigma[j] -= all_data.Sigma_SVD[j];

err_rbki = blas::nrm2(target_rank, all_data.Sigma.data(), 1) / norm_svd_k;
T residual_err = residual_error_comp<T>(all_data, target_rank, custom_rank);

// Print accuracy info
printf("||Sigma_ksvd - Sigma_rbki||_F / ||Sigma_ksvd||_F: %.16e\n", err_rbki);
printf("RBKI is %f times faster that SVD.\n", (T) dur_svd / t_rbki_best);

printf("sqrt(||AV - SU||^2_F + ||A'U - VS||^2_F) / sqrt(traget_rank): %.16e\n", residual_err);

std::ofstream file(output_filename, std::ios::app);
file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << err_rbki << ", " << dur_rbki << ", " << dur_svd << ",\n";
file << b_sz << ", " << RBKI.max_krylov_iters << ", " << target_rank << ", " << custom_rank << ", " << residual_err << ", " << dur_rbki << ", " << dur_svd << ",\n";

state_gen = state;
data_regen<T, RNG>(m_info, all_data, state_gen, 0);
Expand All @@ -132,11 +187,12 @@ int main(int argc, char *argv[]) {
int64_t b_sz_stop = 0;
int64_t target_rank_start = 512;
int64_t target_rank_curr = target_rank_start;
int64_t target_rank_stop = 4096;
int64_t target_rank_stop = 512;
int64_t custom_rank = 32;
double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
auto state = RandBLAS::RNGState();
auto state_constant = state;
int numruns = 5;
int numruns = 1;
long dur_svd = 0;
std::vector<long> res;

Expand All @@ -150,18 +206,14 @@ int main(int argc, char *argv[]) {
// Update basic params.
m = m_info.rows;
n = m_info.cols;
b_sz_start = 8;//std::max((int64_t) 1, n / 10);
b_sz_stop = 128;//std::max((int64_t) 1, n / 10);
b_sz_start = 16;//std::max((int64_t) 1, n / 10);
b_sz_stop = 16;//std::max((int64_t) 1, n / 10);

// Allocate basic workspace.
RBKI_benchmark_data<double> all_data(m, n, tol);

// Fill the data matrix;
RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A.data(), state);
// Read the singular vectors from argv2
int64_t buf1 = 1;
int buf2 = 0;
RandLAPACK::gen::process_input_mat(m, buf1, all_data.Sigma_SVD.data(), argv[2], buf2);

printf("Finished data preparation\n");

Expand All @@ -176,7 +228,7 @@ int main(int argc, char *argv[]) {

for (;b_sz_start <= b_sz_stop; b_sz_start *=2) {
for (;target_rank_curr <= target_rank_stop; target_rank_curr *=2) {
call_all_algs<double, r123::Philox4x32>(m_info, numruns, b_sz_start, target_rank_curr, all_data, state_constant, output_filename, dur_svd);
call_all_algs<double, r123::Philox4x32>(m_info, numruns, b_sz_start, target_rank_curr, custom_rank, all_data, state_constant, output_filename, dur_svd);
}
target_rank_curr = target_rank_start;
}
Expand Down

0 comments on commit d8a1fd2

Please sign in to comment.