Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

DNM: Trackable benchmarking for CQRRP #84

Open
wants to merge 36 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
36 commits
Select commit Hold shift + click to select a range
0329736
easy switching between QRF and CholQR+ORHR_CHOL for the panels
rileyjmurray Sep 10, 2024
b8e7892
results for 4k
rileyjmurray Sep 10, 2024
ea7246c
move results to reflect machine
rileyjmurray Sep 10, 2024
4546822
profiling for 16k-by-16k took no time at all!
rileyjmurray Sep 10, 2024
d984e73
CI now ignores this branch
TeachRaccooon Sep 10, 2024
af16d58
Bug fix, bench update
TeachRaccooon Sep 10, 2024
08af4f9
fix mac ci
TeachRaccooon Sep 10, 2024
7be3736
results
rileyjmurray Sep 10, 2024
1925b9d
Benchmark & CI fix
TeachRaccooon Sep 10, 2024
7be9bfb
Forgot to save yaml file
TeachRaccooon Sep 10, 2024
2a55bdd
yet another fix
TeachRaccooon Sep 10, 2024
508f169
results
rileyjmurray Sep 10, 2024
dd5693f
32k benchmarks
rileyjmurray Sep 10, 2024
0231bb6
Add smaller matrices (4423 is just a weird number, 8191 is a Mersenne…
rileyjmurray Sep 10, 2024
0de526a
Fixed redundant invocation of column swapping in CQRRP_CPU
TeachRaccooon Sep 13, 2024
2280397
Augmented the logic in performing copies of A for column pivoting, sh…
TeachRaccooon Sep 13, 2024
687ebe2
Small bug fix + benchmark improvement
TeachRaccooon Sep 14, 2024
ea319d0
Small benchmarks update
TeachRaccooon Sep 16, 2024
d2164a5
Small benchmarks update
TeachRaccooon Sep 16, 2024
7da87e7
Small bug fix
TeachRaccooon Sep 18, 2024
7e12c0c
Small bug fix
TeachRaccooon Sep 18, 2024
0bc3192
Update
TeachRaccooon Oct 1, 2024
a4e6825
Switched timing units for consistency
TeachRaccooon Oct 1, 2024
3d92940
Update
TeachRaccooon Oct 2, 2024
ebdd766
Update
TeachRaccooon Oct 3, 2024
6480e1c
Merge branch 'main' into cqrrp-gpu-benchmarking
rileyjmurray Oct 5, 2024
1b255eb
EPYC-9354P benchmarks
rileyjmurray Oct 6, 2024
e61a3ed
partial results for 64k
Oct 10, 2024
cb1a02e
source used to dispatch experiments from last commit (and ongoing exp…
Oct 10, 2024
bf81e86
results for 64k
Oct 11, 2024
0e0ff75
results
Oct 13, 2024
914f920
plain QR benchmark
Oct 14, 2024
9d0b8f5
Added new benchmark
TeachRaccooon Oct 17, 2024
51f8d9f
New files
TeachRaccooon Oct 17, 2024
2b4197d
Spiked gen bug fix
TeachRaccooon Oct 17, 2024
0444ee4
Fixed CQRRPT pivot quality benchmark bug
TeachRaccooon Oct 18, 2024
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .github/workflows/core-linux.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
name: core-linux
on:
pull_request:
branches-ignore:
- cqrrp-gpu-benchmarking
workflow_dispatch:
push:
branches-ignore:
- cqrrp-gpu-benchmarking

jobs:
build:
Expand Down
5 changes: 5 additions & 0 deletions .github/workflows/core-macos.yaml
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
name: core-macos
on:
pull_request:
branches-ignore:
- cqrrp-gpu-benchmarking
workflow_dispatch:
push:
branches-ignore:
- cqrrp-gpu-benchmarking

jobs:
build:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
benchmark/build/**
**/private_config.sh

# vim
*.sw*
Expand Down
131 changes: 66 additions & 65 deletions RandLAPACK/drivers/rl_cqrrp.hh
Original file line number Diff line number Diff line change
Expand Up @@ -56,16 +56,13 @@ class CQRRP_blocked : public CQRRPalg<T, RNG> {
T ep,
int64_t b_sz
) {
timing = time_subroutines;
eps = ep;
timing = time_subroutines;
eps = ep;
block_size = b_sz;
use_qp3 = false;
use_gemqrt = false;
internal_nb = b_sz;
tol = std::numeric_limits<T>::epsilon();
timing = time_subroutines;
eps = ep;
timing = time_subroutines;
eps = ep;
block_size = b_sz;
use_qp3 = false;
use_qrf = false;
use_gemqrt = false;
internal_nb = b_sz;
tol = std::numeric_limits<T>::epsilon();
Expand Down Expand Up @@ -134,6 +131,9 @@ class CQRRP_blocked : public CQRRPalg<T, RNG> {
// QRCP option
bool use_qp3;

// Option to use GEQRF on a panel
bool use_qrf;

// Option for updating A
bool use_gemqrt;

Expand Down Expand Up @@ -168,8 +168,8 @@ int CQRRP_blocked<T, RNG>::call(
high_resolution_clock::time_point skop_t_start;
high_resolution_clock::time_point qrcp_t_start;
high_resolution_clock::time_point qrcp_t_stop;
high_resolution_clock::time_point cholqr_t_start;
high_resolution_clock::time_point cholqr_t_stop;
high_resolution_clock::time_point panelqr_t_start;
high_resolution_clock::time_point panelqr_t_stop;
high_resolution_clock::time_point reconstruction_t_start;
high_resolution_clock::time_point reconstruction_t_stop;
high_resolution_clock::time_point preconditioning_t_start;
Expand All @@ -187,7 +187,7 @@ int CQRRP_blocked<T, RNG>::call(
long preallocation_t_dur = 0;
long skop_t_dur = 0;
long qrcp_t_dur = 0;
long cholqr_t_dur = 0;
long panelqr_t_dur = 0;
long reconstruction_t_dur = 0;
long preconditioning_t_dur = 0;
long r_piv_t_dur = 0;
Expand Down Expand Up @@ -347,22 +347,14 @@ int CQRRP_blocked<T, RNG>::call(
if(this -> timing) {
qrcp_t_stop = high_resolution_clock::now();
qrcp_t_dur += duration_cast<microseconds>(qrcp_t_stop - qrcp_t_start).count();
r_piv_t_start = high_resolution_clock::now();
preconditioning_t_start = high_resolution_clock::now();
}

// Need to permute trailing columns of the full R-factor.
// Remember that the R-factor is stored in the upper-triangular portion of A.
if(iter != 0)
util::col_swap(curr_sz, cols, cols, &A[lda * curr_sz], m, J_buf);

if(this -> timing) {
r_piv_t_stop = high_resolution_clock::now();
r_piv_t_dur += duration_cast<microseconds>(r_piv_t_stop - r_piv_t_start).count();
preconditioning_t_start = high_resolution_clock::now();
}

// Pivoting the current matrix A.
util::col_swap(rows, cols, cols, A_work, lda, J_buf);
// Pivoting the trailing R and the ``current'' A.
// The copy of A operation is done on a separate stream. If it was not, it would have been done here.
util::col_swap(m, cols, cols, &A[lda * curr_sz], lda, J_buf);

// Checking for the zero matrix post-pivoting is the best idea,
// as we would only need to check one column (pivoting moves the column with the largest norm upfront)
Expand Down Expand Up @@ -424,47 +416,55 @@ int CQRRP_blocked<T, RNG>::call(
}

if(this -> timing)
cholqr_t_start = high_resolution_clock::now();
panelqr_t_start = high_resolution_clock::now();

if (use_qrf) {
// Performing QRF on a panel - this skips ORHR_COL and tau extraction
tau_sub = &tau[curr_sz];
lapack::geqrf(rows, block_rank, A_work, lda, tau_sub);
// Need to copy R into a separate buffer because there is no trtrmm in LAPACK.
lapack::lacpy(MatrixType::Upper, block_rank, block_rank, A_work, lda, R_cholqr, b_sz_const);
} else {
// Performing Cholesky QR on a panel
blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, block_rank, rows, (T) 1.0, A_work, lda, (T) 0.0, R_cholqr, b_sz_const);
lapack::potrf(Uplo::Upper, block_rank, R_cholqr, b_sz_const);
// Compute Q_econ from Cholesky QR
blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, block_rank, (T) 1.0, R_cholqr, b_sz_const, A_work, lda);

// Performing Cholesky QR
blas::syrk(Layout::ColMajor, Uplo::Upper, Op::Trans, block_rank, rows, (T) 1.0, A_work, lda, (T) 0.0, R_cholqr, b_sz_const);
lapack::potrf(Uplo::Upper, block_rank, R_cholqr, b_sz_const);
// Compute Q_econ from Cholesky QR
blas::trsm(Layout::ColMajor, Side::Right, Uplo::Upper, Op::NoTrans, Diag::NonUnit, rows, block_rank, (T) 1.0, R_cholqr, b_sz_const, A_work, lda);
if(this -> timing) {
panelqr_t_stop = high_resolution_clock::now();
panelqr_t_dur += duration_cast<microseconds>(panelqr_t_stop - panelqr_t_start).count();
reconstruction_t_start = high_resolution_clock::now();
}

if(this -> timing) {
cholqr_t_stop = high_resolution_clock::now();
cholqr_t_dur += duration_cast<microseconds>(cholqr_t_stop - cholqr_t_start).count();
reconstruction_t_start = high_resolution_clock::now();
}
// Find Q (stored in A) using Householder reconstruction.
// This will represent the full (rows by rows) Q factor from Cholesky QR
// It would have been really nice to store T right above Q, but without using extra space,
// it would result in us losing the first lower-triangular b_sz by b_sz portion of implicitly-stored Q.
// Filling T without ever touching its lower-triangular space would be a nice optimization for orhr_col routine.
// Q is defined with block_rank elementary reflectors.
// NOTE:
/// This routine is defined in LAPACK 3.9.0.
lapack::orhr_col(rows, block_rank, internal_nb, A_work, lda, T_dat, b_sz_const, Work2);

// Find Q (stored in A) using Householder reconstruction.
// This will represent the full (rows by rows) Q factor from Cholesky QR
// It would have been really nice to store T right above Q, but without using extra space,
// it would result in us losing the first lower-triangular b_sz by b_sz portion of implicitly-stored Q.
// Filling T without ever touching its lower-triangular space would be a nice optimization for orhr_col routine.
// Q is defined with block_rank elementary reflectors.
// NOTE:
/// This routine is defined in LAPACK 3.9.0.
lapack::orhr_col(rows, block_rank, internal_nb, A_work, lda, T_dat, b_sz_const, Work2);

// Need to change signs in the R-factor from Cholesky QR.
// Signs correspond to matrix D from orhr_col().
// This allows us to not explicitly compute R11_full = (Q[:, 1:block_rank])' * A_pre.
for(i = 0; i < block_rank; ++i)
for(j = 0; j < (i + 1); ++j)
R_cholqr[(b_sz_const * i) + j] *= Work2[j];

// Define a pointer to the current subportion of tau vector.
tau_sub = &tau[curr_sz];
// Entries of tau will be placed on the main diagonal of the block matrix T from orhr_col().
for(i = 0; i < block_rank; ++i)
tau_sub[i] = T_dat[(b_sz_const * i) + (i % internal_nb)];
if(this -> timing) {
reconstruction_t_stop = high_resolution_clock::now();
reconstruction_t_dur += duration_cast<microseconds>(reconstruction_t_stop - reconstruction_t_start).count();
updating1_t_start = high_resolution_clock::now();
}

if(this -> timing) {
reconstruction_t_stop = high_resolution_clock::now();
reconstruction_t_dur += duration_cast<microseconds>(reconstruction_t_stop - reconstruction_t_start).count();
updating1_t_start = high_resolution_clock::now();
// Need to change signs in the R-factor from Cholesky QR.
// Signs correspond to matrix D from orhr_col().
// This allows us to not explicitly compute R11_full = (Q[:, 1:block_rank])' * A_pre.
for(i = 0; i < block_rank; ++i)
for(j = 0; j < (i + 1); ++j)
R_cholqr[(b_sz_const * i) + j] *= Work2[j];

// Define a pointer to the current subportion of tau vector.
tau_sub = &tau[curr_sz];
// Entries of tau will be placed on the main diagonal of the block matrix T from orhr_col().
for(i = 0; i < block_rank; ++i)
tau_sub[i] = T_dat[(b_sz_const * i) + (i % internal_nb)];
}

// Perform Q_full' * A_piv(:, b_sz:end) to find R12 and the new "current A."
Expand All @@ -474,7 +474,8 @@ int CQRRP_blocked<T, RNG>::call(
// With that, everything is placed where it should be, no copies required.
// Q is defined with block_rank elementary reflectors.
// GEMQRT is a faster alternative to ORMQR, takes in the matrix T instead of vector tau.
if(use_gemqrt) {
// Using QRF prevents us from using gemqrt unless matrix T was explicitly constructed.
if(use_gemqrt && !use_qrf) {
lapack::gemqrt(Side::Left, Op::Trans, rows, cols - b_sz, block_rank, internal_nb, A_work, lda, T_dat, b_sz_const, Work1, lda);
} else {
lapack::ormqr(Side::Left, Op::Trans, rows, cols - b_sz, block_rank, A_work, lda, tau_sub, Work1, lda);
Expand Down Expand Up @@ -532,14 +533,14 @@ int CQRRP_blocked<T, RNG>::call(
total_t_dur = duration_cast<microseconds>(total_t_stop - total_t_start).count();
long t_rest = total_t_dur - (preallocation_t_dur + skop_t_dur + qrcp_t_dur + reconstruction_t_dur + preconditioning_t_dur + updating1_t_dur + updating2_t_dur + updating3_t_dur + r_piv_t_dur);
this -> times.resize(12);
this -> times = {skop_t_dur, preallocation_t_dur, qrcp_t_dur, preconditioning_t_dur, cholqr_t_dur, reconstruction_t_dur, updating1_t_dur, updating2_t_dur, updating3_t_dur, r_piv_t_dur, t_rest, total_t_dur};
this -> times = {skop_t_dur, preallocation_t_dur, qrcp_t_dur, preconditioning_t_dur, panelqr_t_dur, reconstruction_t_dur, updating1_t_dur, updating2_t_dur, updating3_t_dur, r_piv_t_dur, t_rest, total_t_dur};

printf("\n\n/------------CQRRP TIMING RESULTS BEGIN------------/\n");
printf("Preallocation time: %25ld μs,\n", preallocation_t_dur);
printf("skop time: %34ld μs,\n", skop_t_dur);
printf("QRCP time: %36ld μs,\n", qrcp_t_dur);
printf("Preconditioning time: %24ld μs,\n", preconditioning_t_dur);
printf("CholQR time: %32ld μs,\n", cholqr_t_dur);
printf("CholQR time: %32ld μs,\n", panelqr_t_dur);
printf("Householder vector restoration time: %7ld μs,\n", reconstruction_t_dur);
printf("Computing A_new, R12 time: %23ld μs,\n", updating1_t_dur);
printf("Factors updating time: %23ld μs,\n", updating3_t_dur);
Expand All @@ -552,7 +553,7 @@ int CQRRP_blocked<T, RNG>::call(
printf("skop generation and application takes %2.2f%% of runtime.\n", 100 * ((T) skop_t_dur / (T) total_t_dur));
printf("QRCP takes %32.2f%% of runtime.\n", 100 * ((T) qrcp_t_dur / (T) total_t_dur));
printf("Preconditioning takes %20.2f%% of runtime.\n", 100 * ((T) preconditioning_t_dur / (T) total_t_dur));
printf("Cholqr takes %29.2f%% of runtime.\n", 100 * ((T) cholqr_t_dur / (T) total_t_dur));
printf("Cholqr takes %29.2f%% of runtime.\n", 100 * ((T) panelqr_t_dur / (T) total_t_dur));
printf("Householder restoration takes %12.2f%% of runtime.\n", 100 * ((T) reconstruction_t_dur / (T) total_t_dur));
printf("Computing A_new, R12 takes %14.2f%% of runtime.\n", 100 * ((T) updating1_t_dur / (T) total_t_dur));
printf("Factors updating time takes %14.2f%% of runtime.\n", 100 * ((T) updating3_t_dur / (T) total_t_dur));
Expand Down
Loading
Loading