diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index 998c0b63..564325e1 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -85,6 +85,9 @@ class CQRRP_blocked : public CQRRPalg { /// @param[in] tau /// Pointer to a vector of size n. On entry, is empty. /// + /// @param[in] state + /// RNG state parameter, required for sketching operator generation. + /// /// @param[out] A /// Overwritten by Implicit Q and explicit R factors. /// diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index 77406edd..70aa3743 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -92,6 +92,9 @@ class CQRRPT : public CQRRPTalg { /// Represents the upper-triangular R factor of QR factorization. /// On entry, is empty and may not have any space allocated for it. /// + /// @param[in] state + /// RNG state parameter, required for sketching operator generation. + /// /// @param[out] A /// Overwritten by an m-by-k orthogonal Q factor. /// Matrix is stored explicitly. 
diff --git a/RandLAPACK/drivers/rl_nysbki.hh b/RandLAPACK/drivers/rl_nysbki.hh deleted file mode 100644 index 439c4c02..00000000 --- a/RandLAPACK/drivers/rl_nysbki.hh +++ /dev/null @@ -1,117 +0,0 @@ -#ifndef randlapack_NysBKI_h -#define randlapack_NysBKI_h - -#include "rl_util.hh" -#include "rl_blaspp.hh" -#include "rl_lapackpp.hh" -#include "rl_hqrrp.hh" - -#include -#include -#include -#include -#include -#include - -using namespace std::chrono; - -namespace RandLAPACK { - -template -class NysBKIalg { - public: - virtual ~NysBKIalg() {} - virtual int call( - int64_t m, - T* A, - int64_t lda, - int64_t k, - T* V, - T* Lambda, - RandBLAS::RNGState &state - ) = 0; -}; - -template -class NysBKI : public NysBKIalg { - public: - NysBKI( - bool verb, - bool time_subroutines, - T ep - ) { - verbosity = verb; - timing = time_subroutines; - tol = ep; - max_krylov_iters = INT_MAX; - } - int call( - int64_t m, - T* A, - int64_t lda, - int64_t k, - T* V, - T* Lambda, - RandBLAS::RNGState &state - ) override; - public: - bool verbosity; - bool timing; - T tol; - int num_krylov_iters; - int max_krylov_iters; - std::vector times; - T norm_R_end; -}; - -// ----------------------------------------------------------------------------- -template -int NysBKI::call( - int64_t m, - T* A, - int64_t lda, - int64_t k, - T* V, - T* Lambda, - RandBLAS::RNGState &state -){ - int iter = 0; - - T* X = ( T * ) calloc( m * (m + k), sizeof( T ) ); - T* X_i = X; - T* Y = ( T * ) calloc( m * (m + k), sizeof( T ) ); - T* Y_i = Y; - - // tau space for QR - T* tau = ( T * ) calloc( k, sizeof( T ) ); - - - // Generate a dense Gaussian random matrx. 
- RandBLAS::DenseDist D(m, k); - state = RandBLAS::fill_dense(D, X_i, state).second; - // [X_i, ~] = qr(randn(m, m), 0) - lapack::geqrf(m, k, X_i, m, tau); - // Y_i = A * X_i - blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m, k, m, 1.0, A, m, X_i, m, 0.0, Y_i, m); - - while(iter < max_krylov_iters) { - // Advance X_i pointer - X_i = X_i + (m * k); - lapack::lacpy(MatrixType::Upper, m, k, X, m, X_i, m); - - if (!iter) { - // X_i+1 = Y_i + tol * X_i; - blas::scal(m * k, this->tol, X_i, 1); - blas::axpy(m * k, 1.0, Y_i, 1, X_i, 1); - } else { - - } - - - - } - - return 0; -} -} // end namespace RandLAPACK -#endif \ No newline at end of file diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 69261b1f..71ed4c4f 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -21,6 +21,21 @@ namespace RandLAPACK { template class RBKIalg { public: + + /// RBKI algorithm is a method for finding a truncated SVD based on block Krylov iterations. + /// This algorithm is a version of Algorithm A.1 from https://arxiv.org/pdf/2306.12418.pdf + /// + /// The main difference is in the fact that an economy SVD is performed only once at the very end + /// of the algorithm run and that the termination criterion is not based on singular vector residual evaluation. + /// Instead, the scheme terminates if: + /// 1. ||R||_F > sqrt(1 - eps^2) ||A||_F, which ensures that we've exhausted all vectors and doing more + /// iterations would bring no benefit or that ||A - hat(A)||_F < eps * ||A||_F. + /// 2. Stop if the bottom right entry of R or S is numerically close to zero (up to square root of machine eps). + /// + /// The main cost of this algorithm comes from large GEMMs with the input matrix A. + /// + /// The algorithm optionally times all of its subcomponents through a user-defined 'timing' parameter. 
+ virtual ~RBKIalg() {} virtual int call( int64_t m, @@ -48,6 +63,51 @@ class RBKI : public RBKIalg { tol = ep; max_krylov_iters = INT_MAX; } + + /// Computes a truncated SVD of the form: + /// A ~= U * diag(Sigma) * VT, + /// where U and VT have orthonormal columns and rows, respectively, and Sigma holds the singular values. + /// The factors are computed via block Krylov iterations and are returned in the U, VT and Sigma buffers. + /// + /// @param[in] m + /// The number of rows in the matrix A. + /// + /// @param[in] n + /// The number of columns in the matrix A. + /// + /// @param[in] A + /// Pointer to the m-by-n matrix A, stored in a column-major format. + /// + /// @param[in] lda + /// Leading dimension of A. + /// + /// @param[in] k + /// Sampling dimension of a sketching operator, m >= (k * n) >= n. + /// + /// @param[in] U + /// On entry, an empty matrix. + /// + /// @param[in] VT + /// On entry, an empty matrix. + /// + /// @param[in] Sigma + /// On entry, an empty matrix. + /// + /// @param[in] state + /// RNG state parameter, required for sketching operator generation. + /// + /// @param[out] U + /// Stores m by ((num_iters / 2) * k) orthonormal matrix of left singular vectors. + /// + /// @param[out] VT + /// Stores ((num_iters / 2) * k) by n orthonormal matrix of right singular vectors. + /// + /// @param[out] Sigma + /// Stores ((num_iters / 2) * k) singular values. + /// + /// @return = 0: successful exit + /// + int call( int64_t m, int64_t n,