-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
1 parent
4c4df2e
commit 5400b11
Showing
4 changed files
with
616 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,191 @@ | ||
#include "RandLAPACK.hh" | ||
#include "rl_blaspp.hh" | ||
#include "rl_lapackpp.hh" | ||
#include "rl_gen.hh" | ||
|
||
#include <RandBLAS.hh> | ||
#include <fstream> | ||
|
||
// Shared workspace for the QR quality benchmarks in this file: problem
// dimensions, algorithm parameters, and the buffers handed to the LAPACK /
// RandLAPACK routines.
template <typename T>
struct QR_speed_benchmark_data {
    int64_t row;             // m, as passed to the constructor
    int64_t col;             // n, as passed to the constructor
    T tolerance;             // forwarded to the CQRRP_blocked constructor
    T sampling_factor;       // forwarded to CQRRP_blocked.call as d_factor
    std::vector<T> A;        // m * n entries, zero-initialized
    std::vector<T> tau;      // n entries, zero-initialized
    std::vector<int64_t> J;  // n entries, zero-initialized
    std::vector<T> S;        // n entries, zero-initialized

    // Stores the dimensions and parameters and allocates every buffer.
    // Members are initialized in declaration order.
    QR_speed_benchmark_data(int64_t m, int64_t n, T tol, T d_factor)
        : row(m),
          col(n),
          tolerance(tol),
          sampling_factor(d_factor),
          A(m * n, 0.0),
          tau(n, 0.0),
          J(n, 0),
          S(n, 0.0) {}
};
|
||
// Re-generate and clear data | ||
template <typename T, typename RNG> | ||
static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info, | ||
QR_speed_benchmark_data<T> &all_data, | ||
RandBLAS::RNGState<RNG> &state) { | ||
|
||
RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A, state); | ||
std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); | ||
std::fill(all_data.J.begin(), all_data.J.end(), 0); | ||
} | ||
|
||
// Re-generate and clear data | ||
template <typename T, typename RNG> | ||
static std::vector<T> get_norms( QR_speed_benchmark_data<T> &all_data) { | ||
|
||
int64_t m = all_data.row; | ||
int64_t n = all_data.col; | ||
|
||
std::vector<T> R_norms (n, 0.0); | ||
for (int i = 0; i < n; ++i) { | ||
R_norms[i] = lapack::lantr(Norm::Fro, Uplo::Upper, Diag::NonUnit, n - i, n - i, &all_data.A.data()[(m + 1) * i], m); | ||
if (i < 10) | ||
printf("%e\n", R_norms[i]); | ||
} | ||
return R_norms; | ||
} | ||
|
||
template <typename T, typename RNG> | ||
static void R_norm_ratio( | ||
RandLAPACK::gen::mat_gen_info<T> m_info, | ||
int64_t b_sz, | ||
QR_speed_benchmark_data<T> &all_data, | ||
RandBLAS::RNGState<RNG> &state) { | ||
|
||
auto m = all_data.row; | ||
auto n = all_data.col; | ||
auto tol = all_data.tolerance; | ||
auto d_factor = all_data.sampling_factor; | ||
|
||
// Additional params setup. | ||
RandLAPACK::CQRRP_blocked<double, r123::Philox4x32> CQRRP_blocked(false, tol, b_sz); | ||
CQRRP_blocked.nnz = 2; | ||
CQRRP_blocked.num_threads = 8; | ||
|
||
// Running HQRRP | ||
std::iota(all_data.J.begin(), all_data.J.end(), 1); | ||
//RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, 0, 0, state, (T*) nullptr); | ||
lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); | ||
std::vector<T> R_norms_HQRRP = get_norms<T, RNG>(all_data); | ||
printf("\nDone with HQRRP\n"); | ||
|
||
// Clear and re-generate data | ||
data_regen<T, RNG>(m_info, all_data, state); | ||
|
||
printf("\nStarting CQRRP\n"); | ||
// Running CQRRP | ||
CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state); | ||
std::vector<T> R_norms_CQRRP = get_norms<T, RNG>(all_data); | ||
|
||
// Declare a data file | ||
std::fstream file1("data_out/QR_R_norm_ratios_rows_" + std::to_string(m) | ||
+ "_cols_" + std::to_string(n) | ||
+ "_b_sz_" + std::to_string(b_sz) | ||
+ "_d_factor_" + std::to_string(d_factor) | ||
+ ".dat", std::fstream::app); | ||
|
||
// Write the 1st metric info into a file. | ||
for (int i = 0; i < n; ++i) | ||
file1 << R_norms_HQRRP[i] / R_norms_CQRRP[i] << ", "; | ||
} | ||
|
||
template <typename T, typename RNG> | ||
static void sv_ratio( | ||
RandLAPACK::gen::mat_gen_info<T> m_info, | ||
int64_t b_sz, | ||
QR_speed_benchmark_data<T> &all_data, | ||
RandBLAS::RNGState<RNG> &state) { | ||
|
||
auto m = all_data.row; | ||
auto n = all_data.col; | ||
auto tol = all_data.tolerance; | ||
auto d_factor = all_data.sampling_factor; | ||
std::vector<T> geqp3 (n, 0.0); | ||
std::vector<T> sv_ratios_cqrrp (n, 0.0); | ||
|
||
auto state1 = state; | ||
|
||
// Additional params setup. | ||
RandLAPACK::CQRRP_blocked<double, r123::Philox4x32> CQRRP_blocked(false, tol, b_sz); | ||
CQRRP_blocked.nnz = 2; | ||
CQRRP_blocked.num_threads = 8; | ||
|
||
std::fstream file2("data_out/QR_sv_ratios_rows_" + std::to_string(m) | ||
+ "_cols_" + std::to_string(n) | ||
+ "_b_sz_" + std::to_string(b_sz) | ||
+ "_d_factor_" + std::to_string(d_factor) | ||
+ ".dat", std::fstream::app); | ||
|
||
T* R_dat = all_data.A.data(); | ||
T* S_dat = all_data.S.data(); | ||
|
||
// Running SVD | ||
lapack::gesdd(Job::NoVec, m, n, all_data.A.data(), m, all_data.S.data(), (T*) nullptr, m, (T*) nullptr, n); | ||
|
||
// Clear and re-generate data | ||
data_regen<T, RNG>(m_info, all_data, state); | ||
|
||
// Running GEQP3 | ||
std::iota(all_data.J.begin(), all_data.J.end(), 1); | ||
//RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, 0, 0, state, (T*) nullptr); | ||
lapack::geqp3(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data()); | ||
|
||
// Write the 2nd metric info into a file. | ||
for (int i = 0; i < n; ++i) | ||
file2 << std::abs(R_dat[(m + 1) * i] / S_dat[i]) << ", "; | ||
|
||
file2 << ",\n"; | ||
|
||
// Clear and re-generate data | ||
data_regen<T, RNG>(m_info, all_data, state1); | ||
|
||
// Running CQRRP | ||
CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state); | ||
|
||
// Write the 2nd metric info into a file. | ||
for (int i = 0; i < n; ++i) | ||
file2 << std::abs(R_dat[(m + 1) * i] / S_dat[i]) << ", "; | ||
} | ||
|
||
int main() { | ||
// Declare parameters | ||
int64_t m = std::pow(2, 12); | ||
int64_t n = std::pow(2, 12); | ||
double d_factor = 1.125; | ||
int64_t b_sz = 256; | ||
double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85); | ||
auto state = RandBLAS::RNGState(); | ||
auto state_constant1 = state; | ||
auto state_constant2 = state; | ||
// results | ||
std::vector<double> res1; | ||
std::vector<double> res2; | ||
|
||
// Allocate basic workspace | ||
QR_speed_benchmark_data<double> all_data(m, n, tol, d_factor); | ||
// Generate the input matrix - gaussian suffices for performance tests. | ||
RandLAPACK::gen::mat_gen_info<double> m_info(m, n, RandLAPACK::gen::kahan); | ||
//m_info.cond_num = std::pow(10, 10); | ||
//m_info.rank = n; | ||
//m_info.exponent = 2.0; | ||
RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A, state); | ||
|
||
#if !defined(__APPLE__) | ||
R_norm_ratio<double, r123::Philox4x32>(m_info, b_sz, all_data, state_constant1); | ||
printf("R done\n"); | ||
sv_ratio<double, r123::Philox4x32>(m_info, b_sz, all_data, state_constant2); | ||
printf("SV done\n\n"); | ||
#endif | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,125 @@ | ||
#include "RandLAPACK.hh" | ||
#include "rl_blaspp.hh" | ||
#include "rl_lapackpp.hh" | ||
#include "rl_gen.hh" | ||
|
||
#include <RandBLAS.hh> | ||
#include <fstream> | ||
|
||
// Shared workspace for the CQRRP timing benchmark in this file: problem
// dimensions, algorithm parameters, and the buffers handed to the
// factorization routine.
template <typename T>
struct QR_speed_benchmark_data {
    int64_t row;             // m, as passed to the constructor
    int64_t col;             // n, as passed to the constructor
    T tolerance;             // forwarded to the CQRRP_blocked constructor
    T sampling_factor;       // forwarded to CQRRP_blocked.call as d_factor
    std::vector<T> A;        // m * n entries, zero-initialized
    std::vector<T> tau;      // n entries, zero-initialized
    std::vector<int64_t> J;  // n entries, zero-initialized

    // Stores the dimensions and parameters and allocates every buffer.
    // Members are initialized in declaration order.
    QR_speed_benchmark_data(int64_t m, int64_t n, T tol, T d_factor)
        : row(m),
          col(n),
          tolerance(tol),
          sampling_factor(d_factor),
          A(m * n, 0.0),
          tau(n, 0.0),
          J(n, 0) {}
};
|
||
// Re-generate and clear data | ||
template <typename T, typename RNG> | ||
static void data_regen(RandLAPACK::gen::mat_gen_info<T> m_info, | ||
QR_speed_benchmark_data<T> &all_data, | ||
RandBLAS::RNGState<RNG> &state) { | ||
|
||
RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A, state); | ||
std::fill(all_data.tau.begin(), all_data.tau.end(), 0.0); | ||
std::fill(all_data.J.begin(), all_data.J.end(), 0); | ||
} | ||
|
||
template <typename T, typename RNG> | ||
static std::vector<long> call_all_algs( | ||
RandLAPACK::gen::mat_gen_info<T> m_info, | ||
int64_t numruns, | ||
int64_t b_sz, | ||
QR_speed_benchmark_data<T> &all_data, | ||
RandBLAS::RNGState<RNG> &state) { | ||
|
||
auto m = all_data.row; | ||
auto n = all_data.col; | ||
auto tol = all_data.tolerance; | ||
auto d_factor = all_data.sampling_factor; | ||
|
||
// Additional params setup. | ||
RandLAPACK::CQRRP_blocked<double, r123::Philox4x32> CQRRP_blocked(true, tol, b_sz); | ||
CQRRP_blocked.nnz = 2; | ||
CQRRP_blocked.num_threads = 8; | ||
|
||
// Making sure the states are unchanged | ||
auto state_gen_0 = state; | ||
auto state_alg_0 = state; | ||
|
||
// Timing vars | ||
long dur_cqrrp = 0; | ||
long t_cqrrp_best = 0; | ||
std::vector<long> inner_timing_best; | ||
|
||
for (int i = 0; i < numruns; ++i) { | ||
printf("ITERATION\n"); | ||
|
||
// Testing CQRRP - best setuo | ||
auto start_cqrrp = high_resolution_clock::now(); | ||
CQRRP_blocked.call(m, n, all_data.A.data(), m, d_factor, all_data.tau.data(), all_data.J.data(), state_gen_0); | ||
auto stop_cqrrp = high_resolution_clock::now(); | ||
dur_cqrrp = duration_cast<microseconds>(stop_cqrrp - start_cqrrp).count(); | ||
// Update best timing | ||
if (!i || dur_cqrrp < t_cqrrp_best) {t_cqrrp_best = dur_cqrrp; inner_timing_best = CQRRP_blocked.times;} | ||
|
||
// Making sure the states are unchanged | ||
state_gen_0 = state; | ||
state_alg_0 = state; | ||
// Clear and re-generate data | ||
data_regen<T, RNG>(m_info, all_data, state_gen_0); | ||
} | ||
|
||
return inner_timing_best; | ||
} | ||
|
||
int main() { | ||
// Declare parameters | ||
int64_t m = std::pow(2, 14); | ||
int64_t n = std::pow(2, 14); | ||
double d_factor = 1.125; | ||
int64_t b_sz_start = 256; | ||
int64_t b_sz_end = 2048; | ||
double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85); | ||
auto state = RandBLAS::RNGState(); | ||
auto state_constant = state; | ||
// Timing results | ||
std::vector<long> res; | ||
// Number of algorithm runs. We only record best times. | ||
int64_t numruns = 5; | ||
|
||
// Allocate basic workspace | ||
QR_speed_benchmark_data<double> all_data(m, n, tol, d_factor); | ||
// Generate the input matrix - gaussian suffices for performance tests. | ||
RandLAPACK::gen::mat_gen_info<double> m_info(m, n, RandLAPACK::gen::gaussian); | ||
RandLAPACK::gen::mat_gen<double, r123::Philox4x32>(m_info, all_data.A, state); | ||
|
||
// Declare a data file | ||
std::fstream file("CQRRP_inner_speed_" + std::to_string(m) | ||
+ "_cols_" + std::to_string(n) | ||
+ "_b_sz_start_" + std::to_string(b_sz_start) | ||
+ "_b_sz_end_" + std::to_string(b_sz_end) | ||
+ "_d_factor_" + std::to_string(d_factor) | ||
+ ".dat", std::fstream::app); | ||
|
||
#if !defined(__APPLE__) | ||
for (;b_sz_start <= b_sz_end; b_sz_start *= 2) { | ||
res = call_all_algs<double, r123::Philox4x32>(m_info, numruns, b_sz_start, all_data, state_constant); | ||
file << res[0] << ", " << res[1] << ", " << res[2] << ", " << res[3] << ", " << res[4] << ", " << res[5] << ", " << res[6] << ", " << res[7] << ", " << res[8] << ", " << res[9] << ", " << res[10] << ", " << res[11] << ",\n"; | ||
} | ||
#endif | ||
} |
Oops, something went wrong.