From f0adc7748583d91146ad973b818d83a4ea9d7407 Mon Sep 17 00:00:00 2001 From: TeachRaccooon Date: Mon, 29 Apr 2024 14:34:25 -0700 Subject: [PATCH] Thorough HQRRP profiling --- RandLAPACK/drivers/rl_hqrrp.hh | 141 ++++++++++++++++-- .../bench_CQRRP/HQRRP_runtime_breakdown.cc | 18 +-- 2 files changed, 134 insertions(+), 25 deletions(-) diff --git a/RandLAPACK/drivers/rl_hqrrp.hh b/RandLAPACK/drivers/rl_hqrrp.hh index e36c1a38..df15df6f 100644 --- a/RandLAPACK/drivers/rl_hqrrp.hh +++ b/RandLAPACK/drivers/rl_hqrrp.hh @@ -580,7 +580,7 @@ int64_t NoFLA_QRPmod_WY_unb_var4( int64_t * buff_p, T * buff_t, int64_t pivot_B, int64_t m_B, T * buff_B, int64_t ldim_B, int64_t pivot_C, int64_t m_C, T * buff_C, int64_t ldim_C, - int64_t build_T, T * buff_T, int64_t ldim_T, T* buff_R, int64_t ldim_R, T* buff_D) { + int64_t build_T, T * buff_T, int64_t ldim_T, T* buff_R, int64_t ldim_R, T* buff_D, T* timing) { // // "pivoting": If pivoting==1, then QR factorization with pivoting is used. // @@ -607,6 +607,36 @@ int64_t NoFLA_QRPmod_WY_unb_var4( } */ + high_resolution_clock::time_point preallocation_t_start; + high_resolution_clock::time_point preallocation_t_stop; + high_resolution_clock::time_point norms_t_start; + high_resolution_clock::time_point norms_t_stop; + high_resolution_clock::time_point pivoting_t_start; + high_resolution_clock::time_point pivoting_t_stop; + high_resolution_clock::time_point gen_reflector_1_t_start; + high_resolution_clock::time_point gen_reflector_1_t_stop; + high_resolution_clock::time_point gen_reflector_2_t_start; + high_resolution_clock::time_point gen_reflector_2_t_stop; + high_resolution_clock::time_point downdating_t_start; + high_resolution_clock::time_point downdating_t_stop; + high_resolution_clock::time_point gen_T_t_start; + high_resolution_clock::time_point gen_T_t_stop; + high_resolution_clock::time_point total_t_start; + high_resolution_clock::time_point total_t_stop; + long preallocation_t_dur = 0; + long norms_t_dur = 0; + long 
pivoting_t_dur = 0; + long gen_reflector_1_t_dur = 0; + long gen_reflector_2_t_dur = 0; + long downdating_t_dur = 0; + long gen_T_t_dur = 0; + long total_t_dur = 0; + + if(timing != nullptr) { + total_t_start = high_resolution_clock::now(); + preallocation_t_start = high_resolution_clock::now(); + } + int64_t j, mn_A, m_a21, m_A22, n_A22, n_dB, idx_max_col, i_one = 1, n_house_vector, m_rest; T * buff_d, * buff_e, * buff_workspace, diag; @@ -624,9 +654,20 @@ int64_t NoFLA_QRPmod_WY_unb_var4( buff_e = ( T * ) calloc( n_A, sizeof( T ) ); buff_workspace = ( T * ) calloc( n_A, sizeof( T ) ); + if(timing != nullptr) { + preallocation_t_stop = high_resolution_clock::now(); + preallocation_t_dur = duration_cast(preallocation_t_stop - preallocation_t_start).count(); + norms_t_start = high_resolution_clock::now(); + } + // Compute initial norms of A int64_to d and e. NoFLA_QRP_compute_norms( m_A, n_A, buff_A, ldim_A, buff_d, buff_e ); + if(timing != nullptr) { + norms_t_stop = high_resolution_clock::now(); + norms_t_dur = duration_cast(norms_t_stop - norms_t_start).count(); + } + // Main Loop. for( j = 0; j < num_stages; j++ ) { @@ -638,6 +679,10 @@ int64_t NoFLA_QRPmod_WY_unb_var4( // Obtain the index of the column with largest 2-norm. idx_max_col = blas::iamax( n_dB, & buff_d[ j ], i_one ); // - 1; + if(timing != nullptr) { + pivoting_t_start = high_resolution_clock::now(); + } + // Swap columns of A, B, C, pivots, and norms vectors. 
NoFLA_QRP_pivot_G_B_C( idx_max_col, m_A, & buff_A[ 0 + j * ldim_A ], ldim_A, @@ -647,6 +692,12 @@ int64_t NoFLA_QRPmod_WY_unb_var4( & buff_d[ j ], & buff_e[ j ] ); + if(timing != nullptr) { + pivoting_t_stop = high_resolution_clock::now(); + pivoting_t_dur += duration_cast(pivoting_t_stop - pivoting_t_start).count(); + gen_reflector_1_t_start = high_resolution_clock::now(); + } + // Compute tau1 and u21 from alpha11 and a21 such that tau1 and u21 // determine a Householder transform H such that applying H from the // left to the column vector consisting of alpha11 and a21 annihilates @@ -659,6 +710,12 @@ int64_t NoFLA_QRPmod_WY_unb_var4( & buff_t[j] ); + if(timing != nullptr) { + gen_reflector_1_t_stop = high_resolution_clock::now(); + gen_reflector_1_t_dur += duration_cast(gen_reflector_1_t_stop - gen_reflector_1_t_start).count(); + gen_reflector_2_t_start = high_resolution_clock::now(); + } + // | a12t | = H | a12t | // | A22 | | A22 | // @@ -674,12 +731,27 @@ int64_t NoFLA_QRPmod_WY_unb_var4( ); buff_A[ j + j * ldim_A ] = diag; + if(timing != nullptr) { + gen_reflector_2_t_stop = high_resolution_clock::now(); + gen_reflector_2_t_dur += duration_cast(gen_reflector_2_t_stop - gen_reflector_2_t_start).count(); + downdating_t_start = high_resolution_clock::now(); + } + // Update partial column norms. NoFLA_QRP_downdate_partial_norms( m_A22, n_A22, & buff_d[ j+1 ], 1, & buff_e[ j+1 ], 1, & buff_A[ j + ( j+1 ) * ldim_A ], ldim_A, & buff_A[ ( j+1 ) + std::min( n_A-1, ( j+1 ) ) * ldim_A ], ldim_A ); + + if(timing != nullptr) { + downdating_t_stop = high_resolution_clock::now(); + downdating_t_dur += duration_cast(downdating_t_stop - downdating_t_start).count(); + } + } + + if(timing != nullptr) { + gen_T_t_start = high_resolution_clock::now(); } // Build T. 
@@ -690,6 +762,30 @@ int64_t NoFLA_QRPmod_WY_unb_var4( buff_t, buff_T, ldim_T); } + if(timing != nullptr) { + gen_T_t_stop = high_resolution_clock::now(); + gen_T_t_dur = duration_cast(gen_T_t_stop - gen_T_t_start).count(); + } + + if(timing != nullptr) { + total_t_stop = high_resolution_clock::now(); + total_t_dur = duration_cast(total_t_stop - total_t_start).count(); + long other_t_dur = total_t_dur - (preallocation_t_dur + norms_t_dur + pivoting_t_dur + gen_reflector_1_t_dur + gen_reflector_2_t_dur + downdating_t_dur + gen_T_t_dur); + + timing[0] += (T) preallocation_t_dur; + timing[1] += (T) norms_t_dur; + timing[2] += (T) pivoting_t_dur; + timing[3] += (T) gen_reflector_1_t_dur; + timing[4] += (T) gen_reflector_2_t_dur; + timing[5] += (T) downdating_t_dur; + timing[6] += (T) downdating_t_dur; + timing[7] += (T) gen_T_t_dur; + timing[8] += (T) other_t_dur; + timing[9] += (T) total_t_dur; + //printf("%ld\n", timing[7]); + //printf("%ld\n", timing[8]); + } + // Remove auxiliary vectors. free( buff_d ); free( buff_e ); @@ -763,6 +859,16 @@ int64_t hqrrp( long updating_Sketch_t_dur = 0; long total_t_dur = 0; + // Buffer for QRCP timing. + T* timing_QRCP = nullptr; + // Buffer for QR timing. + T* timing_QR = nullptr; + + if(timing != nullptr) { + timing_QRCP = ( T * ) calloc( 10, sizeof( T ) ); + timing_QR = ( T * ) calloc( 10, sizeof( T ) ); + } + if(timing != nullptr) { total_t_start = high_resolution_clock::now(); preallocation_t_start = high_resolution_clock::now(); @@ -935,13 +1041,8 @@ int64_t hqrrp( if(timing != nullptr) { downdating_t_stop = high_resolution_clock::now(); downdating_t_dur += duration_cast(downdating_t_stop - downdating_t_start).count(); - sketching_t_start = high_resolution_clock::now(); } - //t1_stop = high_resolution_clock::now(); - //printf(" Part 1 of HQRRP time %ld\n", duration_cast(t1_stop - t1_start).count()); - //t2_start = high_resolution_clock::now(); - if( !last_iter ) { // Compute QRP of YR, and apply permutations to matrix AR. 
// A copy of YR is made into VR, and permutations are applied to YR. @@ -969,7 +1070,8 @@ int64_t hqrrp( buff_pB, buff_sB, 1, m_A, buff_AR, ldim_A, 1, m_Y, buff_YR, ldim_Y, - 0, (T*) nullptr, 0, (T*) nullptr, 0, (T*) nullptr + 0, (T*) nullptr, 0, (T*) nullptr, 0, (T*) nullptr, + timing_QRCP ); if(timing != nullptr) { @@ -1004,7 +1106,7 @@ int64_t hqrrp( m_AB1, n_AB1, buff_AB1, ldim_A, buff_p1, buff_s1, 1, j, buff_A01, ldim_A, 1, m_Y, buff_Y1, ldim_Y, - 1, buff_T1_T, ldim_W, buff_R, ldim_R, buff_D); + 1, buff_T1_T, ldim_W, buff_R, ldim_R, buff_D, timing_QR); if(timing != nullptr) { qr_t_stop = high_resolution_clock::now(); @@ -1052,18 +1154,11 @@ int64_t hqrrp( updating_Sketch_t_dur += duration_cast(updating_Sketch_t_stop - updating_Sketch_t_start).count(); } } - // Remove auxiliary objects. - free( buff_G ); - free( buff_Y ); - free( buff_V ); - free( buff_W ); - free( buff_R ); - free( buff_D ); if(timing != nullptr) { // Make sure that timing points to a sufficient amount of space. - timing = ( T * ) realloc(timing, 11 * sizeof( T ) ); + timing = ( T * ) realloc(timing, 31 * sizeof( T ) ); total_t_stop = high_resolution_clock::now(); total_t_dur = duration_cast(total_t_stop - total_t_start).count(); @@ -1080,6 +1175,11 @@ int64_t hqrrp( timing[8] = (T) updating_Sketch_t_dur; timing[9] = (T) other_t_dur; timing[10] = (T) total_t_dur; + blas::copy(10, timing_QRCP, 1, &timing[11], 1); + blas::copy(10, timing_QR, 1, &timing[21], 1); + + free( timing_QRCP ); + free( timing_QR ); printf("\n\n/------------HQRRP TIMING RESULTS BEGIN------------/\n"); printf("Preallocation time: %25ld μs,\n", preallocation_t_dur); @@ -1102,6 +1202,15 @@ int64_t hqrrp( printf("Everything else takes %20.2f%% of runtime.\n", 100 * ((T) other_t_dur / (T) total_t_dur)); printf("/-------------CQRRP TIMING RESULTS END-------------/\n\n"); } + + // Remove auxiliary objects. 
+ free( buff_G ); + free( buff_Y ); + free( buff_V ); + free( buff_W ); + free( buff_R ); + free( buff_D ); + return 0; } diff --git a/benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc b/benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc index 8f11263a..a6fd532a 100644 --- a/benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc +++ b/benchmark/bench_CQRRP/HQRRP_runtime_breakdown.cc @@ -71,7 +71,7 @@ static void call_all_algs( int panel_pivoting = 0; // Timing vars - T* times = ( T * ) calloc(11, sizeof( T ) ); + T* times = ( T * ) calloc(31, sizeof( T ) ); for (int i = 0; i < numruns; ++i) { printf("Iteration %d start.\n", i); @@ -81,7 +81,7 @@ static void call_all_algs( RandLAPACK::hqrrp(m, n, all_data.A.data(), m, all_data.J.data(), all_data.tau.data(), b_sz, (d_factor - 1) * b_sz, panel_pivoting, 0, state_alg, times); std::ofstream file(output_filename, std::ios::app); - std::copy(times, times + 11, std::ostream_iterator(file, ", ")); + std::copy(times, times + 31, std::ostream_iterator(file, ", ")); file << "\n"; // Clear and re-generate data @@ -100,14 +100,14 @@ int main() { int64_t n = std::pow(2, 16); double d_factor = 1.125; int64_t b_sz_start = 256; - int64_t b_sz_end = 256; + int64_t b_sz_end = 2048; double tol = std::pow(std::numeric_limits::epsilon(), 0.85); auto state = RandBLAS::RNGState(); auto state_constant = state; // Timing results std::vector res; // Number of algorithm runs. 
- int64_t numruns = 1; + int64_t numruns = 4; // Allocate basic workspace QR_speed_benchmark_data all_data(m, n, tol, d_factor); @@ -116,11 +116,11 @@ int main() { RandLAPACK::gen::mat_gen(m_info, all_data.A.data(), state); // Declare a data file - std::string file= "HQRRP_inner_speed_" + std::to_string(m) - + "_cols_" + std::to_string(n) - + "_b_sz_start_" + std::to_string(b_sz_start) - + "_b_sz_end_" + std::to_string(b_sz_end) - + "_d_factor_" + std::to_string(d_factor) + std::string file= "HQRRP_inner_speed_" + std::to_string(m) + + "_cols_" + std::to_string(n) + + "_b_sz_start_" + std::to_string(b_sz_start) + + "_b_sz_end_" + std::to_string(b_sz_end) + + "_d_factor_" + std::to_string(d_factor) + ".dat"; #if !defined(__APPLE__)