Skip to content

Commit

Permalink
Thorough HQRRP profiling
Browse files Browse the repository at this point in the history
  • Loading branch information
TeachRaccooon committed Apr 29, 2024
1 parent 4c25676 commit f0adc77
Show file tree
Hide file tree
Showing 2 changed files with 134 additions and 25 deletions.
141 changes: 125 additions & 16 deletions RandLAPACK/drivers/rl_hqrrp.hh
Original file line number Diff line number Diff line change
Expand Up @@ -580,7 +580,7 @@ int64_t NoFLA_QRPmod_WY_unb_var4(
int64_t * buff_p, T * buff_t,
int64_t pivot_B, int64_t m_B, T * buff_B, int64_t ldim_B,
int64_t pivot_C, int64_t m_C, T * buff_C, int64_t ldim_C,
int64_t build_T, T * buff_T, int64_t ldim_T, T* buff_R, int64_t ldim_R, T* buff_D) {
int64_t build_T, T * buff_T, int64_t ldim_T, T* buff_R, int64_t ldim_R, T* buff_D, T* timing) {
//
// "pivoting": If pivoting==1, then QR factorization with pivoting is used.
//
Expand All @@ -607,6 +607,36 @@ int64_t NoFLA_QRPmod_WY_unb_var4(
}
*/

high_resolution_clock::time_point preallocation_t_start;
high_resolution_clock::time_point preallocation_t_stop;
high_resolution_clock::time_point norms_t_start;
high_resolution_clock::time_point norms_t_stop;
high_resolution_clock::time_point pivoting_t_start;
high_resolution_clock::time_point pivoting_t_stop;
high_resolution_clock::time_point gen_reflector_1_t_start;
high_resolution_clock::time_point gen_reflector_1_t_stop;
high_resolution_clock::time_point gen_reflector_2_t_start;
high_resolution_clock::time_point gen_reflector_2_t_stop;
high_resolution_clock::time_point downdating_t_start;
high_resolution_clock::time_point downdating_t_stop;
high_resolution_clock::time_point gen_T_t_start;
high_resolution_clock::time_point gen_T_t_stop;
high_resolution_clock::time_point total_t_start;
high_resolution_clock::time_point total_t_stop;
long preallocation_t_dur = 0;
long norms_t_dur = 0;
long pivoting_t_dur = 0;
long gen_reflector_1_t_dur = 0;
long gen_reflector_2_t_dur = 0;
long downdating_t_dur = 0;
long gen_T_t_dur = 0;
long total_t_dur = 0;

if(timing != nullptr) {
total_t_start = high_resolution_clock::now();
preallocation_t_start = high_resolution_clock::now();
}

int64_t j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col,
i_one = 1, n_house_vector, m_rest;
T * buff_d, * buff_e, * buff_workspace, diag;
Expand All @@ -624,9 +654,20 @@ int64_t NoFLA_QRPmod_WY_unb_var4(
buff_e = ( T * ) calloc( n_A, sizeof( T ) );
buff_workspace = ( T * ) calloc( n_A, sizeof( T ) );

if(timing != nullptr) {
preallocation_t_stop = high_resolution_clock::now();
preallocation_t_dur = duration_cast<microseconds>(preallocation_t_stop - preallocation_t_start).count();
norms_t_start = high_resolution_clock::now();
}

// Compute initial norms of A into d and e.
NoFLA_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e );

if(timing != nullptr) {
norms_t_stop = high_resolution_clock::now();
norms_t_dur = duration_cast<microseconds>(norms_t_stop - norms_t_start).count();
}

// Main Loop.
for( j = 0; j < num_stages; j++ ) {

Expand All @@ -638,6 +679,10 @@ int64_t NoFLA_QRPmod_WY_unb_var4(
// Obtain the index of the column with largest 2-norm.
idx_max_col = blas::iamax( n_dB, & buff_d[ j ], i_one ); // - 1;

if(timing != nullptr) {
pivoting_t_start = high_resolution_clock::now();
}

// Swap columns of A, B, C, pivots, and norms vectors.
NoFLA_QRP_pivot_G_B_C( idx_max_col,
m_A, & buff_A[ 0 + j * ldim_A ], ldim_A,
Expand All @@ -647,6 +692,12 @@ int64_t NoFLA_QRPmod_WY_unb_var4(
& buff_d[ j ],
& buff_e[ j ] );

if(timing != nullptr) {
pivoting_t_stop = high_resolution_clock::now();
pivoting_t_dur += duration_cast<microseconds>(pivoting_t_stop - pivoting_t_start).count();
gen_reflector_1_t_start = high_resolution_clock::now();
}

// Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21
// determine a Householder transform H such that applying H from the
// left to the column vector consisting of alpha11 and a21 annihilates
Expand All @@ -659,6 +710,12 @@ int64_t NoFLA_QRPmod_WY_unb_var4(
& buff_t[j]
);

if(timing != nullptr) {
gen_reflector_1_t_stop = high_resolution_clock::now();
gen_reflector_1_t_dur += duration_cast<microseconds>(gen_reflector_1_t_stop - gen_reflector_1_t_start).count();
gen_reflector_2_t_start = high_resolution_clock::now();
}

// | a12t | = H | a12t |
// | A22 | | A22 |
//
Expand All @@ -674,12 +731,27 @@ int64_t NoFLA_QRPmod_WY_unb_var4(
);
buff_A[ j + j * ldim_A ] = diag;

if(timing != nullptr) {
gen_reflector_2_t_stop = high_resolution_clock::now();
gen_reflector_2_t_dur += duration_cast<microseconds>(gen_reflector_2_t_stop - gen_reflector_2_t_start).count();
downdating_t_start = high_resolution_clock::now();
}

// Update partial column norms.
NoFLA_QRP_downdate_partial_norms( m_A22, n_A22,
& buff_d[ j+1 ], 1,
& buff_e[ j+1 ], 1,
& buff_A[ j + ( j+1 ) * ldim_A ], ldim_A,
& buff_A[ ( j+1 ) + std::min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A );

if(timing != nullptr) {
downdating_t_stop = high_resolution_clock::now();
downdating_t_dur += duration_cast<microseconds>(downdating_t_stop - downdating_t_start).count();
}
}

if(timing != nullptr) {
gen_T_t_start = high_resolution_clock::now();
}

// Build T.
Expand All @@ -690,6 +762,30 @@ int64_t NoFLA_QRPmod_WY_unb_var4(
buff_t, buff_T, ldim_T);
}

if(timing != nullptr) {
gen_T_t_stop = high_resolution_clock::now();
gen_T_t_dur = duration_cast<microseconds>(gen_T_t_stop - gen_T_t_start).count();
}

if(timing != nullptr) {
total_t_stop = high_resolution_clock::now();
total_t_dur = duration_cast<microseconds>(total_t_stop - total_t_start).count();
long other_t_dur = total_t_dur - (preallocation_t_dur + norms_t_dur + pivoting_t_dur + gen_reflector_1_t_dur + gen_reflector_2_t_dur + downdating_t_dur + gen_T_t_dur);

timing[0] += (T) preallocation_t_dur;
timing[1] += (T) norms_t_dur;
timing[2] += (T) pivoting_t_dur;
timing[3] += (T) gen_reflector_1_t_dur;
timing[4] += (T) gen_reflector_2_t_dur;
timing[5] += (T) downdating_t_dur;
timing[6] += (T) downdating_t_dur;
timing[7] += (T) gen_T_t_dur;
timing[8] += (T) other_t_dur;
timing[9] += (T) total_t_dur;
//printf("%ld\n", timing[7]);
//printf("%ld\n", timing[8]);
}

// Remove auxiliary vectors.
free( buff_d );
free( buff_e );
Expand Down Expand Up @@ -763,6 +859,16 @@ int64_t hqrrp(
long updating_Sketch_t_dur = 0;
long total_t_dur = 0;

// Buffer for QRCP timing.
T* timing_QRCP = nullptr;
// Buffer for QR timing.
T* timing_QR = nullptr;

if(timing != nullptr) {
timing_QRCP = ( T * ) calloc( 10, sizeof( T ) );
timing_QR = ( T * ) calloc( 10, sizeof( T ) );
}

if(timing != nullptr) {
total_t_start = high_resolution_clock::now();
preallocation_t_start = high_resolution_clock::now();
Expand Down Expand Up @@ -935,13 +1041,8 @@ int64_t hqrrp(
if(timing != nullptr) {
downdating_t_stop = high_resolution_clock::now();
downdating_t_dur += duration_cast<microseconds>(downdating_t_stop - downdating_t_start).count();
sketching_t_start = high_resolution_clock::now();
}

//t1_stop = high_resolution_clock::now();
//printf(" Part 1 of HQRRP time %ld\n", duration_cast<microseconds>(t1_stop - t1_start).count());
//t2_start = high_resolution_clock::now();

if( !last_iter ) {
// Compute QRP of YR, and apply permutations to matrix AR.
// A copy of YR is made into VR, and permutations are applied to YR.
Expand Down Expand Up @@ -969,7 +1070,8 @@ int64_t hqrrp(
buff_pB, buff_sB,
1, m_A, buff_AR, ldim_A,
1, m_Y, buff_YR, ldim_Y,
0, (T*) nullptr, 0, (T*) nullptr, 0, (T*) nullptr
0, (T*) nullptr, 0, (T*) nullptr, 0, (T*) nullptr,
timing_QRCP
);

if(timing != nullptr) {
Expand Down Expand Up @@ -1004,7 +1106,7 @@ int64_t hqrrp(
m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1,
1, j, buff_A01, ldim_A,
1, m_Y, buff_Y1, ldim_Y,
1, buff_T1_T, ldim_W, buff_R, ldim_R, buff_D);
1, buff_T1_T, ldim_W, buff_R, ldim_R, buff_D, timing_QR);

if(timing != nullptr) {
qr_t_stop = high_resolution_clock::now();
Expand Down Expand Up @@ -1052,18 +1154,11 @@ int64_t hqrrp(
updating_Sketch_t_dur += duration_cast<microseconds>(updating_Sketch_t_stop - updating_Sketch_t_start).count();
}
}
// Remove auxiliary objects.
free( buff_G );
free( buff_Y );
free( buff_V );
free( buff_W );
free( buff_R );
free( buff_D );

if(timing != nullptr) {

// Make sure that timing points to a sufficient amount of space.
timing = ( T * ) realloc(timing, 11 * sizeof( T ) );
timing = ( T * ) realloc(timing, 31 * sizeof( T ) );

total_t_stop = high_resolution_clock::now();
total_t_dur = duration_cast<microseconds>(total_t_stop - total_t_start).count();
Expand All @@ -1080,6 +1175,11 @@ int64_t hqrrp(
timing[8] = (T) updating_Sketch_t_dur;
timing[9] = (T) other_t_dur;
timing[10] = (T) total_t_dur;
blas::copy(10, timing_QRCP, 1, &timing[11], 1);
blas::copy(10, timing_QR, 1, &timing[21], 1);

free( timing_QRCP );
free( timing_QR );

printf("\n\n/------------HQRRP TIMING RESULTS BEGIN------------/\n");
printf("Preallocation time: %25ld μs,\n", preallocation_t_dur);
Expand All @@ -1102,6 +1202,15 @@ int64_t hqrrp(
printf("Everything else takes %20.2f%% of runtime.\n", 100 * ((T) other_t_dur / (T) total_t_dur));
printf("/-------------CQRRP TIMING RESULTS END-------------/\n\n");
}

// Remove auxiliary objects.
free( buff_G );
free( buff_Y );
free( buff_V );
free( buff_W );
free( buff_R );
free( buff_D );

return 0;
}

Expand Down
18 changes: 9 additions & 9 deletions benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ static void call_all_algs(
int panel_pivoting = 0;

// Timing vars
T* times = ( T * ) calloc(11, sizeof( T ) );
T* times = ( T * ) calloc(31, sizeof( T ) );

for (int i = 0; i < numruns; ++i) {
printf("Iteration %d start.\n", i);
Expand All @@ -81,7 +81,7 @@ static void call_all_algs(
RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 0, state_alg, times);

std::ofstream file(output_filename, std::ios::app);
std::copy(times, times + 11, std::ostream_iterator<T>(file, ", "));
std::copy(times, times + 31, std::ostream_iterator<T>(file, ", "));
file << "\n";

// Clear and re-generate data
Expand All @@ -100,14 +100,14 @@ int main() {
int64_t n = std::pow(2, 16);
double d_factor = 1.125;
int64_t b_sz_start = 256;
int64_t b_sz_end = 256;
int64_t b_sz_end = 2048;
double tol = std::pow(std::numeric_limits<double>::epsilon(), 0.85);
auto state = RandBLAS::RNGState();
auto state_constant = state;
// Timing results
std::vector<long> res;
// Number of algorithm runs.
int64_t numruns = 1;
int64_t numruns = 4;

// Allocate basic workspace
QR_speed_benchmark_data<double> all_data(m, n, tol, d_factor);
Expand All @@ -116,11 +116,11 @@ int main() {
RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state);

// Declare a data file
std::string file= "HQRRP_inner_speed_" + std::to_string(m)
+ "_cols_" + std::to_string(n)
+ "_b_sz_start_" + std::to_string(b_sz_start)
+ "_b_sz_end_" + std::to_string(b_sz_end)
+ "_d_factor_" + std::to_string(d_factor)
std::string file= "HQRRP_inner_speed_" + std::to_string(m)
+ "_cols_" + std::to_string(n)
+ "_b_sz_start_" + std::to_string(b_sz_start)
+ "_b_sz_end_" + std::to_string(b_sz_end)
+ "_d_factor_" + std::to_string(d_factor)
+ ".dat";

#if !defined(__APPLE__)
Expand Down

0 comments on commit f0adc77

Please sign in to comment.