Skip to content

Commit

Permalink
Update
Browse files Browse the repository at this point in the history
  • Loading branch information
TeachRaccooon committed Nov 17, 2023
1 parent 4c4df2e commit 5400b11
Show file tree
Hide file tree
Showing 4 changed files with 616 additions and 0 deletions.
191 changes: 191 additions & 0 deletions benchmark/bench_CQRRP/CQRRP_pivot_quality.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,191 @@
#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
#include "rl_gen.hh"

#include <RandBLAS.hh>
#include <fstream>

/// Workspace shared by the pivot-quality experiments in this file.
///
/// Holds the problem dimensions, the algorithm parameters and every buffer
/// handed to the factorization routines, so that storage is allocated once
/// and reused between runs.
template <typename T>
struct QR_speed_benchmark_data {
    int64_t row;              ///< Number of rows of the test matrix.
    int64_t col;              ///< Number of columns of the test matrix.
    T tolerance;              ///< Tolerance handed to the CQRRP constructor.
    T sampling_factor;        ///< Sampling factor handed to the CQRRP call.
    std::vector<T> A;         ///< Matrix storage, row * col entries.
    std::vector<T> tau;       ///< One entry per column; passed as `tau`.
    std::vector<int64_t> J;   ///< One entry per column; passed as the pivot array.
    std::vector<T> S;         ///< One entry per column; receives singular values.

    /// Records the parameters and allocates all buffers, zero-filled.
    QR_speed_benchmark_data(int64_t m, int64_t n, T tol, T d_factor)
        : row(m),
          col(n),
          tolerance(tol),
          sampling_factor(d_factor),
          A(m * n, 0.0),
          tau(n, 0.0),
          J(n, 0),
          S(n, 0.0) {}
};

// Puts the workspace back into its pre-factorization state: regenerates the
// test matrix described by m_info into all_data.A and zeroes tau and J.
// all_data.S is deliberately left untouched.
//
// m_info   - matrix description, forwarded to RandLAPACK::gen::mat_gen.
// all_data - workspace whose A, tau and J members are overwritten.
// state    - RNG state, forwarded to mat_gen by reference.
template <typename T, typename RNG>
static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info,
                       QR_speed_benchmark_data<T> &all_data,
                       RandBLAS::RNGState<RNG> &state) {

    // Use this function's own template arguments rather than a fixed
    // <double, r123::Philox4x32>, so the helper works for any T / RNG.
    RandLAPACK::gen::mat_gen<T, RNG>(m_info, all_data.A, state);
    std::fill(all_data.tau.begin(), all_data.tau.end(), T(0));
    std::fill(all_data.J.begin(), all_data.J.end(), 0);
}

// Measures the trailing blocks of the triangular factor stored in all_data.A.
//
// Entry k of the result is the Frobenius norm of the upper-triangular
// (col - k) x (col - k) block whose top-left corner is diagonal entry (k, k)
// of A (leading dimension all_data.row). The first ten norms are also echoed
// to stdout.
//
// Returns a vector with all_data.col entries.
template <typename T, typename RNG>
static std::vector<T> get_norms( QR_speed_benchmark_data<T> &all_data) {

    const int64_t lda   = all_data.row;
    const int64_t order = all_data.col;

    std::vector<T> trailing_norms(order, 0.0);
    for (int64_t k = 0; k < order; ++k) {
        // Diagonal entry (k, k) sits (lda + 1) * k elements into the buffer.
        T* block_start  = &all_data.A.data()[(lda + 1) * k];
        const int64_t block_order = order - k;

        trailing_norms[k] = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit,
                                          block_order, block_order, block_start, lda);
        if (k < 10) {
            printf("%e\n", trailing_norms[k]);
        }
    }
    return trailing_norms;
}

// First pivot-quality metric: ratio of trailing-block norms of R.
//
// Factors the matrix currently held in all_data.A with LAPACK's GEQP3, then
// regenerates the matrix and factors it with CQRRP. For each k, the ratio
//     ||R_geqp3[k:, k:]||_F / ||R_cqrrp[k:, k:]||_F
// is appended to
//     data_out/QR_R_norm_ratios_rows_<m>_cols_<n>_b_sz_<b_sz>_d_factor_<d>.dat
//
// m_info   - description of the test matrix, used to regenerate it.
// b_sz     - block size handed to the CQRRP constructor.
// all_data - workspace; A, tau and J are overwritten.
// state    - RNG state used for regeneration and by CQRRP.
//
// Expects all_data.A to hold the unfactored test matrix on entry.
template <typename T, typename RNG>
static void R_norm_ratio(
    RandLAPACK::gen::mat_gen_info<T> m_info,
    int64_t b_sz,
    QR_speed_benchmark_data<T> &all_data,
    RandBLAS::RNGState<RNG> &state) {

    auto m = all_data.row;
    auto n = all_data.col;
    auto tol = all_data.tolerance;
    auto d_factor = all_data.sampling_factor;

    // Additional params setup.
    RandLAPACK::CQRRP_blocked<T, RNG> CQRRP_blocked(false, tol, b_sz);
    CQRRP_blocked.nnz = 2;
    CQRRP_blocked.num_threads = 8;

    // Running the reference factorization (GEQP3).
    // GEQP3 treats every column with a non-zero JPVT entry as a fixed leading
    // column; all entries must be zero for the columns to be freely pivoted.
    std::fill(all_data.J.begin(), all_data.J.end(), 0);
    // Disabled alternative reference:
    //RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, 0, 0, state, (T*) nullptr);
    lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data());
    std::vector<T> R_norms_reference = get_norms<T, RNG>(all_data);
    printf("\nDone with HQRRP\n");

    // Clear and re-generate data
    data_regen<T, RNG>(m_info, all_data, state);

    printf("\nStarting CQRRP\n");
    // Running CQRRP
    CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state);
    std::vector<T> R_norms_CQRRP = get_norms<T, RNG>(all_data);

    // Declare a data file
    std::fstream file1("data_out/QR_R_norm_ratios_rows_" + std::to_string(m)
                     + "_cols_"     + std::to_string(n)
                     + "_b_sz_"     + std::to_string(b_sz)
                     + "_d_factor_" + std::to_string(d_factor)
                     + ".dat", std::fstream::app);
    if (!file1.is_open()) {
        // Without this check a missing data_out/ directory silently discards
        // the results of the whole run.
        fprintf(stderr, "R_norm_ratio: could not open output file in data_out/\n");
        return;
    }

    // Write the 1st metric info into a file.
    for (int64_t i = 0; i < n; ++i)
        file1 << R_norms_reference[i] / R_norms_CQRRP[i] << ", ";
}

// Second pivot-quality metric: |R[i, i]| / sigma_i.
//
// Computes the singular values of the test matrix with GESDD, then factors
// the regenerated matrix with GEQP3 and with CQRRP. For each factorization,
// one row of ratios |R[i, i] / sigma_i| is appended to
//     data_out/QR_sv_ratios_rows_<m>_cols_<n>_b_sz_<b_sz>_d_factor_<d>.dat
// The GEQP3 row is terminated by ",\n"; the CQRRP row follows it.
//
// m_info   - description of the test matrix, used to (re)generate it.
// b_sz     - block size handed to the CQRRP constructor.
// all_data - workspace; A, tau, J and S are overwritten.
// state    - RNG state used for regeneration and by CQRRP.
template <typename T, typename RNG>
static void sv_ratio(
    RandLAPACK::gen::mat_gen_info<T> m_info,
    int64_t b_sz,
    QR_speed_benchmark_data<T> &all_data,
    RandBLAS::RNGState<RNG> &state) {

    auto m = all_data.row;
    auto n = all_data.col;
    auto tol = all_data.tolerance;
    auto d_factor = all_data.sampling_factor;

    // Copies of the incoming RNG state, taken before anything can advance it.
    auto state1 = state;
    auto state_svd = state;

    // Additional params setup.
    RandLAPACK::CQRRP_blocked<T, RNG> CQRRP_blocked(false, tol, b_sz);
    CQRRP_blocked.nnz = 2;
    CQRRP_blocked.num_threads = 8;

    std::fstream file2("data_out/QR_sv_ratios_rows_" + std::to_string(m)
                     + "_cols_"     + std::to_string(n)
                     + "_b_sz_"     + std::to_string(b_sz)
                     + "_d_factor_" + std::to_string(d_factor)
                     + ".dat", std::fstream::app);
    if (!file2.is_open()) {
        // The file is this function's only output; bail out before the
        // expensive factorizations if it cannot be written.
        fprintf(stderr, "sv_ratio: could not open output file in data_out/\n");
        return;
    }

    // Start from a freshly generated matrix. The workspace may still hold the
    // in-place output of an earlier factorization (main() calls R_norm_ratio
    // first), and the singular values must be those of the test matrix itself.
    data_regen<T, RNG>(m_info, all_data, state_svd);

    // Running SVD
    lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.S.data(), (T*) nullptr, m, (T*) nullptr, n);

    // Clear and re-generate data
    data_regen<T, RNG>(m_info, all_data, state);

    // Running GEQP3
    // GEQP3 treats every column with a non-zero JPVT entry as a fixed leading
    // column; all entries must be zero for the columns to be freely pivoted.
    std::fill(all_data.J.begin(), all_data.J.end(), 0);
    // Disabled alternative reference:
    //RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, 0, 0, state, (T*) nullptr);
    lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data());

    // Write the 2nd metric info into a file.
    // A and S are indexed through the vectors (not cached raw pointers) so the
    // reads stay valid even if regeneration ever reallocates the storage.
    for (int64_t i = 0; i < n; ++i)
        file2 << std::abs(all_data.A[(m + 1) * i] / all_data.S[i]) << ", ";

    file2 << ",\n";

    // Clear and re-generate data
    data_regen<T, RNG>(m_info, all_data, state1);

    // Running CQRRP
    CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state);

    // Write the 2nd metric info into a file.
    for (int64_t i = 0; i < n; ++i)
        file2 << std::abs(all_data.A[(m + 1) * i] / all_data.S[i]) << ", ";
}

// Pivot-quality benchmark driver: builds a 4096 x 4096 Kahan test matrix and
// runs both quality metrics (R_norm_ratio, sv_ratio) on it. The metrics are
// compiled out on Apple platforms.
int main() {
    // Declare parameters
    // Exact integer powers of two; avoids a round trip through floating point.
    int64_t m = int64_t{1} << 12;
    int64_t n = int64_t{1} << 12;
    double d_factor = 1.125;
    int64_t b_sz = 256;
    double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
    auto state = RandBLAS::RNGState();
    // Copies taken before `state` is handed to the generator, so each metric
    // starts from the same initial RNG state.
    auto state_constant1 = state;
    auto state_constant2 = state;

    // Allocate basic workspace
    QR_speed_benchmark_data<double> all_data(m, n, tol, d_factor);
    // Generate the input matrix - a Kahan matrix is used for the quality tests.
    RandLAPACK::gen::mat_gen_info<double> m_info(m, n, RandLAPACK::gen::kahan);
    //m_info.cond_num = std::pow(10, 10);
    //m_info.rank = n;
    //m_info.exponent = 2.0;
    RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A, state);

#if !defined(__APPLE__)
    R_norm_ratio<double, r123::Philox4x32>(m_info, b_sz, all_data, state_constant1);
    printf("R done\n");
    sv_ratio<double, r123::Philox4x32>(m_info, b_sz, all_data, state_constant2);
    printf("SV done\n\n");
#endif
}
125 changes: 125 additions & 0 deletions benchmark/bench_CQRRP/CQRRP_runtime_breakdown.cc
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
#include "RandLAPACK.hh"
#include "rl_blaspp.hh"
#include "rl_lapackpp.hh"
#include "rl_gen.hh"

#include <RandBLAS.hh>
#include <fstream>

/// Workspace shared by the runtime-breakdown benchmark in this file.
///
/// Holds the problem dimensions, the algorithm parameters and the buffers
/// handed to CQRRP, so that storage is allocated once and reused between runs.
template <typename T>
struct QR_speed_benchmark_data {
    int64_t row;              ///< Number of rows of the test matrix.
    int64_t col;              ///< Number of columns of the test matrix.
    T tolerance;              ///< Tolerance handed to the CQRRP constructor.
    T sampling_factor;        ///< Sampling factor handed to the CQRRP call.
    std::vector<T> A;         ///< Matrix storage, row * col entries.
    std::vector<T> tau;       ///< One entry per column; passed as `tau`.
    std::vector<int64_t> J;   ///< One entry per column; passed as the pivot array.

    /// Records the parameters and allocates all buffers, zero-filled.
    QR_speed_benchmark_data(int64_t m, int64_t n, T tol, T d_factor)
        : row(m),
          col(n),
          tolerance(tol),
          sampling_factor(d_factor),
          A(m * n, 0.0),
          tau(n, 0.0),
          J(n, 0) {}
};

// Puts the workspace back into its pre-factorization state: regenerates the
// test matrix described by m_info into all_data.A and zeroes tau and J.
//
// m_info   - matrix description, forwarded to RandLAPACK::gen::mat_gen.
// all_data - workspace whose A, tau and J members are overwritten.
// state    - RNG state, forwarded to mat_gen by reference.
template <typename T, typename RNG>
static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info,
                       QR_speed_benchmark_data<T> &all_data,
                       RandBLAS::RNGState<RNG> &state) {

    // Use this function's own template arguments rather than a fixed
    // <double, r123::Philox4x32>, so the helper works for any T / RNG.
    RandLAPACK::gen::mat_gen<T, RNG>(m_info, all_data.A, state);
    std::fill(all_data.tau.begin(), all_data.tau.end(), T(0));
    std::fill(all_data.J.begin(), all_data.J.end(), 0);
}

// Runs CQRRP `numruns` times on the matrix in all_data and reports the
// internal timing breakdown of the fastest run.
//
// m_info   - description of the test matrix, used to regenerate it between runs.
// numruns  - number of repetitions; only the run with the smallest wall time
//            is kept.
// b_sz     - block size handed to the CQRRP constructor.
// all_data - workspace; A, tau and J are overwritten.
// state    - initial RNG state; it is only copied, never advanced here.
//
// Returns a copy of CQRRP_blocked.times taken after the fastest run, or an
// empty vector when numruns <= 0.
template <typename T, typename RNG>
static std::vector<long> call_all_algs(
    RandLAPACK::gen::mat_gen_info<T> m_info,
    int64_t numruns,
    int64_t b_sz,
    QR_speed_benchmark_data<T> &all_data,
    RandBLAS::RNGState<RNG> &state) {

    auto m = all_data.row;
    auto n = all_data.col;
    auto tol = all_data.tolerance;
    auto d_factor = all_data.sampling_factor;

    // Additional params setup.
    RandLAPACK::CQRRP_blocked<T, RNG> CQRRP_blocked(true, tol, b_sz);
    CQRRP_blocked.nnz = 2;
    CQRRP_blocked.num_threads = 8;

    // Making sure the states are unchanged
    auto state_gen_0 = state;
    auto state_alg_0 = state;

    // Timing vars
    long dur_cqrrp = 0;
    long t_cqrrp_best = 0;
    std::vector<long> inner_timing_best;

    for (int64_t i = 0; i < numruns; ++i) {
        printf("ITERATION\n");

        // Testing CQRRP - best setup.
        // The algorithm gets its own state copy: state_gen_0 is handed to the
        // matrix generator below, so reusing it here would let later runs
        // start from a different RNG state than the first one.
        auto start_cqrrp = std::chrono::high_resolution_clock::now();
        CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state_alg_0);
        auto stop_cqrrp = std::chrono::high_resolution_clock::now();
        dur_cqrrp = std::chrono::duration_cast<std::chrono::microseconds>(stop_cqrrp - start_cqrrp).count();
        // Update best timing
        if (!i || dur_cqrrp < t_cqrrp_best) {
            t_cqrrp_best = dur_cqrrp;
            inner_timing_best = CQRRP_blocked.times;
        }

        // Making sure the states are unchanged
        state_gen_0 = state;
        state_alg_0 = state;
        // Clear and re-generate data
        data_regen<T, RNG>(m_info, all_data, state_gen_0);
    }

    return inner_timing_best;
}

// Runtime-breakdown benchmark driver: builds a 16384 x 16384 Gaussian test
// matrix and, for block sizes 256, 512, ..., 2048, appends one line with the
// twelve internal CQRRP timings of the fastest run to the output file.
// The benchmark loop is compiled out on Apple platforms.
//
// Returns 1 if the output file cannot be opened or a run reports fewer than
// twelve timings; 0 otherwise.
int main() {
    // Declare parameters
    // Exact integer powers of two; avoids a round trip through floating point.
    int64_t m = int64_t{1} << 14;
    int64_t n = int64_t{1} << 14;
    double d_factor = 1.125;
    int64_t b_sz_start = 256;
    int64_t b_sz_end = 2048;
    double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
    auto state = RandBLAS::RNGState();
    // Copy taken before `state` is handed to the generator.
    auto state_constant = state;
    // Timing results
    std::vector<long> res;
    // Number of algorithm runs. We only record best times.
    int64_t numruns = 5;
    // Number of timing columns written per line.
    constexpr std::size_t num_timings = 12;

    // Allocate basic workspace
    QR_speed_benchmark_data<double> all_data(m, n, tol, d_factor);
    // Generate the input matrix - gaussian suffices for performance tests.
    RandLAPACK::gen::mat_gen_info<double> m_info(m, n, RandLAPACK::gen::gaussian);
    RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A, state);

    // Declare a data file
    std::fstream file("CQRRP_inner_speed_"  + std::to_string(m)
                    + "_cols_"       + std::to_string(n)
                    + "_b_sz_start_" + std::to_string(b_sz_start)
                    + "_b_sz_end_"   + std::to_string(b_sz_end)
                    + "_d_factor_"   + std::to_string(d_factor)
                    + ".dat", std::fstream::app);
    if (!file.is_open()) {
        fprintf(stderr, "main: could not open the timing output file\n");
        return 1;
    }

#if !defined(__APPLE__)
    for (int64_t b_sz = b_sz_start; b_sz <= b_sz_end; b_sz *= 2) {
        res = call_all_algs<double, r123::Philox4x32>(m_info, numruns, b_sz, all_data, state_constant);
        // Indexing past the end of `res` would be undefined behaviour.
        if (res.size() < num_timings) {
            fprintf(stderr, "main: expected at least 12 timing entries\n");
            return 1;
        }
        // Same layout as before: entries separated by ", ", line ends with ",\n".
        for (std::size_t k = 0; k < num_timings; ++k)
            file << res[k] << (k + 1 < num_timings ? ", " : ",\n");
    }
#endif
}
Loading

0 comments on commit 5400b11

Please sign in to comment.