diff --git a/.gitignore b/.gitignore index 1cdca533..b140e8a4 100644 --- a/.gitignore +++ b/.gitignore @@ -1,4 +1,6 @@ benchmark/build/** +benchmark/bench_kernelalgs/**.py +benchmark/bench_kernelalgs/rpcpy # vim *.sw* diff --git a/CMake/rl_version.cmake b/CMake/rl_version.cmake index 789b7280..48eec794 100644 --- a/CMake/rl_version.cmake +++ b/CMake/rl_version.cmake @@ -1,27 +1,54 @@ +# Initialize tmp variable set(tmp) + +# Find Git executable find_package(Git QUIET) if(GIT_FOUND) - execute_process(COMMAND ${GIT_EXECUTABLE} - --git-dir=${CMAKE_SOURCE_DIR}/.git describe - --tags --match "[0-9]*.[0-9]*.[0-9]*" - OUTPUT_VARIABLE tmp OUTPUT_STRIP_TRAILING_WHITESPACE - ERROR_QUIET) + message(STATUS "Git found: ${GIT_EXECUTABLE}") + execute_process( + COMMAND ${GIT_EXECUTABLE} --git-dir=${CMAKE_SOURCE_DIR}/.git describe --tags --match "[0-9]*.[0-9]*.[0-9]*" + OUTPUT_VARIABLE tmp + OUTPUT_STRIP_TRAILING_WHITESPACE + ERROR_VARIABLE git_error + RESULT_VARIABLE git_result + ) + + # Print the result of the Git command + message(STATUS "Git command result: ${git_result}") + message(STATUS "Git command output: ${tmp}") + if(NOT git_result EQUAL 0) + message(WARNING "Git command failed with error: ${git_error}") + set(tmp "0.0.0") + endif() +else() + message(WARNING "Git not found, using fallback version 0.0.0") + set(tmp "0.0.0") endif() + +# Check if tmp is empty and set a fallback version if necessary if(NOT tmp) + message(WARNING "Git describe output is empty, using fallback version 0.0.0") set(tmp "0.0.0") endif() -set(RandLAPACK_VERSION ${tmp} CACHE STRING "RandLAPACK version" FORCE) +# Debugging: Print tmp before setting RandLAPACK_VERSION +message(STATUS "tmp before setting RandLAPACK_VERSION: ${tmp}") -string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" - "\\1" RandLAPACK_VERSION_MAJOR ${RandLAPACK_VERSION}) +# Set RandLAPACK_VERSION without CACHE option +set(RandLAPACK_VERSION "${tmp}") +message(STATUS "RandLAPACK_VERSION after setting: ${RandLAPACK_VERSION}") 
-string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" - "\\2" RandLAPACK_VERSION_MINOR ${RandLAPACK_VERSION}) +# Ensure RandLAPACK_VERSION is not empty +if(NOT RandLAPACK_VERSION) + message(FATAL_ERROR "RandLAPACK_VERSION is empty") +endif() -string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)" - "\\3" RandLAPACK_VERSION_PATCH ${RandLAPACK_VERSION}) +# Extract major, minor, and patch versions +string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)$" "\\1" RandLAPACK_VERSION_MAJOR "${RandLAPACK_VERSION}") +string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)$" "\\2" RandLAPACK_VERSION_MINOR "${RandLAPACK_VERSION}") +string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)$" "\\3" RandLAPACK_VERSION_PATCH "${RandLAPACK_VERSION}") +# Print extracted version components message(STATUS "RandLAPACK_VERSION_MAJOR=${RandLAPACK_VERSION_MAJOR}") message(STATUS "RandLAPACK_VERSION_MINOR=${RandLAPACK_VERSION_MINOR}") message(STATUS "RandLAPACK_VERSION_PATCH=${RandLAPACK_VERSION_PATCH}") diff --git a/RandBLAS b/RandBLAS index 172d0963..5ca3b3e5 160000 --- a/RandBLAS +++ b/RandBLAS @@ -1 +1 @@ -Subproject commit 172d0963f16743defa646b32e7e0279b52230f99 +Subproject commit 5ca3b3e573f2a7c3509cc5362bf3a00a7e8e2ff6 diff --git a/RandLAPACK.hh b/RandLAPACK.hh index 6fceabdb..c3818ae7 100644 --- a/RandLAPACK.hh +++ b/RandLAPACK.hh @@ -1,10 +1,16 @@ #ifndef RANDLAPACK_HH #define RANDLAPACK_HH +// config and dependencies +#include "RandLAPACK/rl_blaspp.hh" +#include "RandLAPACK/rl_lapackpp.hh" +#include "RandBLAS.hh" + // misc #include "RandLAPACK/misc/rl_util.hh" #include "RandLAPACK/misc/rl_linops.hh" #include "RandLAPACK/misc/rl_gen.hh" +#include "RandLAPACK/misc/rl_pdkernels.hh" // Computational routines #include "RandLAPACK/comps/rl_determiter.hh" @@ -15,6 +21,7 @@ #include "RandLAPACK/comps/rl_syps.hh" #include "RandLAPACK/comps/rl_syrf.hh" #include "RandLAPACK/comps/rl_orth.hh" +#include "RandLAPACK/comps/rl_rpchol.hh" // Drivers #include 
"RandLAPACK/drivers/rl_rsvd.hh" @@ -22,6 +29,7 @@ #include "RandLAPACK/drivers/rl_cqrrp.hh" #include "RandLAPACK/drivers/rl_revd2.hh" #include "RandLAPACK/drivers/rl_rbki.hh" +#include "RandLAPACK/drivers/rl_krillx.hh" // Cuda functions - issues with linking/visibility when present if the below is uncommented. // A temporary fix is to add the below directly in the test/benchmark files. diff --git a/RandLAPACK/CMakeLists.txt b/RandLAPACK/CMakeLists.txt index 055e4a12..7c6d90fd 100644 --- a/RandLAPACK/CMakeLists.txt +++ b/RandLAPACK/CMakeLists.txt @@ -6,6 +6,7 @@ set(RandLAPACK_cxx_sources rl_cqrrp.hh rl_rsvd.hh rl_revd2.hh + rl_krillx.hh rl_qb.hh rl_orth.hh rl_util.hh @@ -14,6 +15,7 @@ set(RandLAPACK_cxx_sources rl_rf.hh rl_syps.hh rl_syrf.hh + rl_rpchol.hh rl_gen.hh rl_blaspp.hh rl_linops.hh diff --git a/RandLAPACK/comps/rl_determiter.hh b/RandLAPACK/comps/rl_determiter.hh index d8d292f2..af974edb 100644 --- a/RandLAPACK/comps/rl_determiter.hh +++ b/RandLAPACK/comps/rl_determiter.hh @@ -1,6 +1,7 @@ #pragma once #include "rl_blaspp.hh" +#include "rl_linops.hh" #include #include @@ -8,11 +9,14 @@ namespace RandLAPACK { -// moved run_pcgls_ex to test -// void run_pcgls_ex(int n, int m); +/* Solve the saddle point problem + (A'A + mu*I)x = A'b - c + Have access to a matrix M such that + (A'A + mu*I)MM' is well-conditioned. 
+*/ template -void pcg( +void pcg_saddle( int64_t m, int64_t n, const T* A, @@ -28,8 +32,7 @@ void pcg( const T* x0, // length n T* x, // length n T* y // length m - ) -{ +) { std::vector out_a1(m, 0.0); std::vector out_at1(n, 0.0); std::vector out_m1(n, 0.0); @@ -126,4 +129,315 @@ void pcg( blas::gemv(Layout::ColMajor, Op::NoTrans, m, n, -1.0, A, lda, x, 1, 1.0, y, 1); } + +// MARK: [L/B]PCG helpers + +template +struct StatefulSeminorm { + ~StatefulSeminorm() {}; + virtual T operator()(int64_t n, int64_t s, const T* NR) = 0; +}; + +template +struct StatefulFrobeniusNorm { + std::vector history; + StatefulFrobeniusNorm() : history() {}; + inline T operator()(int64_t n, int64_t s, const T* NR) { + T nrm = blas::nrm2(n * s, NR, 1); + this->history.push_back(nrm); + return nrm; + }; +}; + +template +void zero_off_diagonal(T* mat, int64_t s) { + for (int64_t i = 0; i < s - 1; ++i) { + T* ptr_to_next_diag = mat + i + i*s; + blas::scal(s, 0.0, ptr_to_next_diag + 1, 1); + } +} + +/** + * A is a symmetric column-major matrix represented by its lower triangle. + * + * If A is not PSD then this function returns an error code -(n+2). + * If A is (near) zero then this function returns an error code -(n+1). + * In all other cases this function returns k = dim(ker(A)). + * + * If A is PSD then its trailing n - k columns will be overwritten by a + * matrix B where pinv(A) = BB'. + * + * @param[in] n matrix dimension + * @param[in,out] A buffer for symmetric n-by-n matrix stored in host memory. + * @param[in] lda leading dimension of A. + * @param[out] work buffer of length >= n; overwritten by the eigenvalues of A. 
+ * + * @returns k = dim(ker(A)) + */ +template +int64_t psd_sqrt_pinv( + int64_t n, + T* A, + int64_t lda, + T* work +) { + lapack::syevd(lapack::Job::Vec, blas::Uplo::Lower, n, A, lda, work); + T rel_tol = 10 * std::numeric_limits::epsilon(); + T abs_tol = rel_tol * std::max(1.0, work[n - 1]); + if (work[0] < -abs_tol) { + std::cout << "The input matrix was not positive semidefinite." << std::endl; + return -(n + 1); + } else if (work[n - 1] < abs_tol) { + std::cout << "The input matrix is zero, up to numerical precision." << std::endl; + return -(n + 2); + } + int ker = n; + while(ker > 0) { + if (work[ker - 1] > abs_tol) { + blas::scal(n, 1/std::sqrt(work[ker - 1]), &A[(ker - 1) * n], 1); + ker = ker - 1; + } else { + break; + } + } + return ker; +} + +/** + * Check if LHS is PSD. If it is, then update RHS <- pinv(LHS)*RHS. + * + * First we try to Cholesky decompose LHS. If that fails, we compute + * its eigendecomposition. If the eigendecomposition shows that LHS + * is (close to) the zero matrix or has negative eigenvalues then we + * return an error code. Otherwise, we use the eigendecomposition to + * perform the update for RHS. + * + * @param[in] n + * Matrix dimension + * @param[in,out] LHS + * buffer for an n-by-n matrix. + * Contents of this buffer are destroyed. + * @param[in] lda + * Leading dimension of LHS. + * @param[in,out] RHS + * buffer for n-by-n matrix. + * @param[in] ldb + * Leading dimension of RHS. + * @param[out] work + * buffer of size >= n*n. + * + * @returns k = rank(LHS), or an error code. 
+ */ +template +int64_t posm_square( + int64_t n, + std::vector & LHS, + int64_t lda, + std::vector & RHS, + int64_t ldb, + std::vector & work +) { + auto layout = blas::Layout::ColMajor; + auto uplo = blas::Uplo::Lower; + using blas::Op; + using blas::Side; + using blas::Diag; + assert(n * n <= (int64_t) work.size()); + + // Try Cholesky (store a backup of LHS into "work") + std::copy(LHS.begin(), LHS.end(), work.begin()); + int chol_err = lapack::potrf(uplo, n, LHS.data(), lda); + if (!chol_err) { + blas::trsm( + layout, Side::Left, uplo, Op::NoTrans, + Diag::NonUnit, n, n, 1.0, LHS.data(), lda, RHS.data(), ldb + ); // L y = b + blas::trsm( + layout, Side::Left, uplo, Op::Trans, + Diag::NonUnit, n, n, 1.0, LHS.data(), lda, RHS.data(), ldb + ); // L^T x = y + return n; + } + // Cholesky failed. + // apply pinv(LHS) * RHS by computing an eigendecomposition of LHS. + T* LHS_eigvecs = work.data(); + T* LHS_eigvals = LHS.data(); + int64_t ker = psd_sqrt_pinv(n, LHS_eigvecs, n, LHS_eigvals); + if (ker < 0) { + return ker; + } else if (ker == n) { + T* rhs = RHS.data(); + for (int i = 0; i < n; ++i) { + for (int j = 0; j < n; ++j) { + rhs[i + ldb*j] = 0.0; + } + } + return 0; + } + int64_t rank = n - ker; + T* pinv_sqrt = &LHS_eigvecs[ker * n]; + + // pinv_sqrt is n-by-rank, and pinv(LHS) = pinv_sqrt * (pinv_sqrt'). 
+ blas::gemm( + layout, Op::Trans, Op::NoTrans, rank, n, n, 1.0, pinv_sqrt, n, RHS.data(), n, 0.0, work.data(), rank + ); // work <- pinv_sqrt' * RHS + blas::gemm( + layout, Op::NoTrans, Op::NoTrans, n, n, rank, 1.0, pinv_sqrt, n, work.data(), rank, 0.0, RHS.data(), n + ); // RHS <- pinv_sqrt * work + return rank; +} + +namespace hidden { + + +// bool should_stop(int64_t &k, int64_t &stalls, double normNR, double prevnormNR, double normNR0) { +// if (normNR < 1e-12 + 1e-9 * normNR0) { +// return true; +// } else if (normNR > 0.8 * prevnormNR) { +// if (stalls < 5) { +// stalls++; +// } else { +// k = -k; +// return true; +// } +// } else { +// stalls = 0; +// } +// return false; +// } + +} + +// MARK: [L/B]PCG + +template +void lockorblock_pcg( + FG &G, + const std::vector &H, + T tol, + int64_t max_iters, + FN &N, + FSeminorm &seminorm, + std::vector &X, + bool verbose = false +) { + int64_t n = G.m; + randblas_require(n == N.m); + int64_t s = ((int64_t) H.size()) / n; + int64_t ns = n*s; + int64_t ss = s*s; + randblas_require(ns == (int64_t) H.size()); + randblas_require(ns == (int64_t) X.size()); + bool treat_as_separable = G.regs.size() > 1; + if (treat_as_separable) + randblas_require(s == (int64_t) G.regs.size()); + + using std::vector; + + vector R(H); + vector P(ns, 0.0); + vector GP(P); + vector NR_or_scratch(P); + + vector RNR(ss, 0.0); + vector alpha(RNR); + vector beta(RNR); + vector more_scratch(RNR); + vector alpha_beta_left_buffer(RNR); + + T normNR = INFINITY, prevnormNR = INFINITY; + + auto layout = blas::Layout::ColMajor; + using blas::Op; + + G(layout, s, 1.0, X.data(), n, 0.0, GP.data(), n); + // ^ GP <- G X + blas::axpy(ns, -1.0, GP.data(), 1, R.data(), 1); + T normR0 = seminorm(n, s, R.data()); + // ^ R <- R - G X + N(layout, s, 1.0, R.data(), n, 0.0, P.data(), n); + // ^ P <- N R + T normNR0 = seminorm(n, s, P.data()); + blas::gemm( + layout, Op::Trans, Op::NoTrans, s, s, n, 1.0, R.data(), n, P.data(), n, 0.0, RNR.data(), s + ); // RNR <- R^T 
P = R^T N R + if (treat_as_separable) + zero_off_diagonal(RNR.data(), s); + alpha = RNR; + + int64_t k = 0; + T stop_abstol = tol*(1.0 + normNR0); + int64_t subspace_dim = 0; + if (verbose) + std::cout << "normNR : " << normNR0 << "\tnormR : " << normR0 << "\tk: 0\tdim : 0\n"; + while (subspace_dim < n && k < max_iters) { + // + // Update X and R + // + k++; + + G(layout, s, (T) 1.0, P.data(), n, (T) 0.0, GP.data(), n); + // ^ GP <- G P + blas::gemm( + layout, Op::Trans, Op::NoTrans, s, s, n, 1.0, P.data(), n, GP.data(), n, 0.0, alpha_beta_left_buffer.data(), s + ); // alpha_beta_left_buffer <- P^T G P + if (treat_as_separable) + zero_off_diagonal(alpha_beta_left_buffer.data(), s); + + int64_t subspace_incr = posm_square( + s, alpha_beta_left_buffer, s, alpha, s, more_scratch + ); // alpha <- (alpha_beta_left_buffer)^(-1) alpha + if (treat_as_separable && subspace_incr > 0) + subspace_incr = 1; + + if (subspace_incr < - ((int64_t) s) ) + break; + subspace_dim = subspace_dim + subspace_incr; + + blas::gemm( + layout, Op::NoTrans, Op::NoTrans, n, s, s, 1.0, P.data(), n, alpha.data(), s, 1.0, X.data(), n + ); // X <- X + P alpha + blas::gemm( + layout, Op::NoTrans, Op::NoTrans, n, s, s, -1.0, GP.data(), n, alpha.data(), s, 1.0, R.data(), n + ); // R <- R - GP alpha + + // + // Check termination criteria + // + // TODO: change how we check termination criteria in the event that we're working + // with treat_as_separable = true. 
+ T normR = seminorm(n, s, R.data()); + + N(layout, s, 1.0, R.data(), n, 0.0, NR_or_scratch.data(), n); // NR <- N R + prevnormNR = normNR; + normNR = seminorm(n, s, NR_or_scratch.data()); + if (verbose) + std::cout << "normNR : " << normNR << "\tnormR : " << normR << "\tk: " << k << "\tdim : " << subspace_dim << '\n'; + if (normNR < stop_abstol) + break; + // + // Update P, beta, and alpha + // + alpha_beta_left_buffer = RNR; + blas::gemm( + layout, blas::Op::Trans, blas::Op::NoTrans, s, s, n, 1.0, R.data(), n, NR_or_scratch.data(), n, 0.0, RNR.data(), s + ); // RNR <- R^T NR + if (treat_as_separable) + zero_off_diagonal(RNR.data(), s); + alpha = RNR; + beta = alpha; + int err = posm_square( + s, alpha_beta_left_buffer, s, beta, s, more_scratch + ); // beta <- (alpha_beta_left_buffer)^-1 beta + if (err < - ((int64_t) s)) + break; + blas::gemm( + layout, Op::NoTrans, Op::NoTrans, n, s, s, 1.0, P.data(), n, beta.data(), s, 1.0, NR_or_scratch.data(), n + ); // NR_or_scratch <- P * beta + P = NR_or_scratch; + } + return; +} + + } // end namespace RandLAPACK diff --git a/RandLAPACK/comps/rl_preconditioners.hh b/RandLAPACK/comps/rl_preconditioners.hh index 4f2e9567..13bd7da0 100644 --- a/RandLAPACK/comps/rl_preconditioners.hh +++ b/RandLAPACK/comps/rl_preconditioners.hh @@ -3,11 +3,14 @@ #include "rl_blaspp.hh" #include "rl_lapackpp.hh" #include "rl_util.hh" +#include "rl_linops.hh" +#include "rl_pdkernels.hh" + #include "rl_orth.hh" #include "rl_syps.hh" #include "rl_syrf.hh" #include "rl_revd2.hh" -#include "rl_linops.hh" +#include "rl_rpchol.hh" #include #include @@ -141,15 +144,11 @@ RandBLAS::RNGState rpc_data_svd_saso( T *sigma_sk, //buffer of size at least n. 
RandBLAS::RNGState state ) { - RandBLAS::SparseDist D{ - .n_rows = d, - .n_cols = m, - .vec_nnz = k - }; + RandBLAS::SparseDist D(d, m, k, RandBLAS::Axis::Short); RandBLAS::SparseSkOp S(D, state); - auto next_state = RandBLAS::fill_sparse(S); + RandBLAS::fill_sparse(S); rpc_data_svd(layout, m, n, A, lda, S, V_sk, sigma_sk); - return next_state; + return S.next_state; } /** @@ -196,19 +195,23 @@ int64_t make_right_orthogonalizer( int64_t n, T* V, T* sigma, - T mu + T mu, + int64_t cols_V = -1 ) { + if (cols_V < 0) { + cols_V = n; + } double sqrtmu = std::sqrt((double) mu); auto regularized = [sqrtmu](T s) { return (sqrtmu == 0) ? s : (T) std::hypot((double) s, sqrtmu); }; T curr_s = regularized(sigma[0]); - T abstol = curr_s * n * std::numeric_limits::epsilon(); + T abstol = curr_s * cols_V * std::numeric_limits::epsilon(); int64_t rank = 0; int64_t inter_col_stride = (layout == Layout::ColMajor) ? n : 1; - int64_t intra_col_stride = (layout == Layout::ColMajor) ? 1 : n; - while (rank < n) { + int64_t intra_col_stride = (layout == Layout::ColMajor) ? 1 : cols_V; + while (rank < cols_V) { curr_s = regularized(sigma[rank]); if (curr_s < abstol) break; @@ -277,7 +280,7 @@ int64_t make_right_orthogonalizer( */ template RandBLAS::RNGState nystrom_pc_data( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, std::vector &V, std::vector &eigvals, int64_t &k, @@ -314,8 +317,8 @@ RandBLAS::RNGState nystrom_pc_data( * This wraps a function of the same name that accepts a SymmetricLinearOperator object. * The purpose of this wrapper is just to define such an object from data (uplo, A, m). 
*/ -template -RandBLAS::RNGState nystrom_pc_data( +template +STATE nystrom_pc_data( Uplo uplo, const T* A, int64_t m, @@ -323,13 +326,78 @@ RandBLAS::RNGState nystrom_pc_data( std::vector &eigvals, int64_t &k, T mu_min, - RandBLAS::RNGState state, + STATE state, int64_t num_syps_passes = 3, int64_t num_steps_power_iter_error_est = 10 ) { - ExplicitSymLinOp A_linop(m, uplo, A, m, Layout::ColMajor); + linops::ExplicitSymLinOp A_linop(m, uplo, A, m, Layout::ColMajor); return nystrom_pc_data(A_linop, V, eigvals, k, mu_min, state, num_syps_passes, num_steps_power_iter_error_est); } +/** + * TODO: make an overload of rpchol_pc_data that omits "n" and assumes A implements + * some linear operator interface. + */ + +template +STATE rpchol_pc_data( + int64_t n, FUNC &A_stateless, int64_t &k, int64_t b, T* V, T* eigvals, STATE state +) { + std::vector selection(k, -1); + state = RandLAPACK::rp_cholesky(n, A_stateless, k, selection.data(), V, b, state); + // ^ A_stateless \approx VV'; need to convert VV' into its eigendecomposition. + std::vector work(k*k, 0.0); + lapack::gesdd(lapack::Job::OverwriteVec, n, k, V, n, eigvals, nullptr, 1, work.data(), k); + // V has been overwritten with its (nontrivial) left singular vectors + for (int64_t i = 0; i < k; ++i) + eigvals[i] = std::pow(eigvals[i], 2); + return state; +} + + +/** + * V is a buffer for an n-by-k matrix in column-major format. + * + * We implicitly have our hands on an n-by-n matrix A = F F' where + * F is n-by-k and defined in terms of (V, eigvals, use_eigvals). + * If use_eigvals = true, then + * F = V * diag(sqrt(eigvals)), V is column-orthonormal, and + * eigvals contains positive numbers sorted in decreasing order. + * Otherwise, + * F = V and we ignore the values of "eigvals" passed as input. + * + * upper_tri is a buffer for an n-by-n upper-triangular matrix in + * column-major format. 
It implicitly defines a matrix + * + * A_conj = inv(upper_tri)' A inv(upper_tri) + * + * This function overwrites (V, eigvals) with the eigenvectors and + * eigenvalues of A_conj, where eigenvalues are sorted in decreasing + * order. + **/ +template +void ut_conjugate_spectral_pc_data( + int64_t n, int64_t k, T* V, T* eigvals, const T* upper_tri, std::vector &work, bool use_eigvals +) { + // Step 1: Get our hands on F so that A = FF'. + if (use_eigvals) { + for (int i = 0; i < k; ++i) { + blas::scal(n, (T) std::pow(eigvals[i], (T) 0.5), V + i*n, 1); + } + } + // Step 2: Overwrite F = inv(upper_tri)'F. + // In BLAS terms, we solve trans(upper_tri) X = F, and store X by overwriting F. + blas::trsm(blas::Layout::ColMajor, blas::Side::Left, blas::Uplo::Upper, blas::Op::Trans, blas::Diag::NonUnit, n, k, 1.0, upper_tri, n, V, n); + // Step 3: Call GESDD: overwrite F with its left + // singular vectors and overwrite eigvals + // with its squared singular values. + if (work.size() < k*k) + work.resize(k*k); + lapack::gesdd(lapack::Job::OverwriteVec, n, k, V, n, eigvals, nullptr, 1, work.data(), k); + for (int i = 0; i < k; ++i) { + eigvals[i] = std::pow(eigvals[i], 2.0); + } + return; +} } // end namespace RandLAPACK diff --git a/RandLAPACK/comps/rl_rpchol.hh b/RandLAPACK/comps/rl_rpchol.hh new file mode 100644 index 00000000..7ac17f2d --- /dev/null +++ b/RandLAPACK/comps/rl_rpchol.hh @@ -0,0 +1,205 @@ +#pragma once + +#include "rl_lapackpp.hh" +#include +#include +#include +#include +#include + +namespace RandLAPACK { + +namespace _rpchol_impl { + +using std::vector; +using blas::Layout; + +template +void compute_columns( + Layout layout, int64_t N, FUNC_T &K_stateless, vector &col_indices, T* buff +) { + randblas_require(layout == Layout::ColMajor); + int64_t num_cols = col_indices.size(); + #pragma omp parallel for collapse(2) + for (int64_t ell = 0; ell < num_cols; ++ell) { + for (int64_t i = 0; i < N; ++i) { + int64_t j = col_indices[ell]; + buff[i + ell*N] = 
K_stateless(i, j); + } + } + return; +} + +template +void pack_selected_rows( + Layout layout, int64_t rows_mat, int64_t cols_mat, T* mat, vector &row_indices, T* submat +) { + randblas_require(layout == Layout::ColMajor); + int64_t num_rows = row_indices.size(); + for (int64_t i = 0; i < num_rows; ++i) { + blas::copy(cols_mat, mat + row_indices[i], rows_mat, submat + i, num_rows); + } + return; +} + +template +int downdate_d_and_cdf(Layout layout, int64_t N, vector &indices, T* F_panel, vector &d, vector &cdf) { + randblas_require(layout == Layout::ColMajor); + int64_t cols_F_panel = indices.size(); + for (int64_t j = 0; j < cols_F_panel; ++j) { + for (int64_t i = 0; i < N; ++i) { + T val = F_panel[i + j*N]; + d[i] -= val*val; + } + } + // Then, to accound for the possibility of rounding errors, manually zero-out everything in "indices." + for (auto i : indices) + d[i] = 0.0; + cdf = d; + try { + RandBLAS::weights_to_cdf(N, cdf.data()); + } catch(RandBLAS::Error &e) { + std::string message{e.what()}; + if (message.find("sum >=") != std::string::npos) { + return 1; + } else if (message.find("val >= error_if_below") != std::string::npos) { + return 2; + } + } + return 0; +} + +} // end namespace RandLAPACK::_rpchol_impl + +/*** + * Computes a rank-k approximation of an implicit n-by-n matrix whose (i,j)^{th} + * entry is A_stateless(i,j), where A_stateless is a stateless function. We build + * the approximation iteratively and increase the rank by at most "b" at each iteration. + * + * Implements Algorithm 4 from https://arxiv.org/abs/2304.12465. + * + * Here's example code where the implict matrix is given by a squared exponential kernel: + * + * // Assume we've already defined ... + * // X : a rows_x by cols_x double-precision matrix (suitably standardized) + * // where each column defines a datapoint. 
+ * // bandwidth : scale for the squared exponential kernel + * + * auto A = [X, rows_x, cols_x, bandwidth](int64_t i, int64_t j) { + * double out = 0; + * double* Xi = X + i*rows_x; + * double* Xj = X + j*rows_x; + * for (int64_t ell = 0; ell < rows_x) { + * double val = (Xi[ell] - Xj[ell]) / (std::sqrt(2)*bandwidth); + * out += val*val; + * } + * out = std::exp(out); + * return out; + * }; + * std::vector F(rows_x*k, 0.0); + * std::vector selection(k); + * RandBLAS::RNGState state_in(0); + * auto state_out = rp_cholesky(cols_x, A, k, selection.data(), F.data(), 64, state_in); + * + * Notes + * ----- + * Compare to + * https://github.com/eepperly/Robust-randomized-preconditioning-for-kernel-ridge-regression/blob/main/code/choleskybase.m + * + */ +template +STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state, CALLBACK &cb) { + // TODO: make this function robust to rank-deficient matrices. + using RandBLAS::sample_indices_iid; + using RandBLAS::weights_to_cdf; + using blas::Op; + using blas::Uplo; + auto layout = blas::Layout::ColMajor; + auto uplo = blas::Uplo::Upper; + + std::vector work_mat(b*k, 0.0); + std::vector d(n, 0.0); + std::vector cdf(n); + + std::vector Sprime{}; + + for (int64_t i = 0; i < n; ++i) + d[i] = A_stateless(i,i); + cdf = d; + weights_to_cdf(n, cdf.data()); + int w_status = 0; + int64_t ell = 0; + while (ell < k) { + if (w_status) { + std::cout << "weights_to_cdf failed with exit code " << w_status << ".\n"; + std::cout << "Returning early, with approximation rank = " << ell << "\n\n"; + k = ell; + cb(k); + return state; + } + // + // 1. Compute the next block of column indices + // + int64_t curr_B = std::min(b, k - ell); + Sprime.resize(curr_B); + state = sample_indices_iid(n, cdf.data(), curr_B, Sprime.data(), state); + std::sort( Sprime.begin(), Sprime.end() ); + Sprime.erase( unique( Sprime.begin(), Sprime.end() ), Sprime.end() ); + int64_t ell_incr = Sprime.size(); + + // + // 2. 
Compute F_panel: the next block of ell_incr columns in F. + // + T* F_panel = F + ell*n; + // + // 2.1. Overwrite F_panel with the matrix "G" from Line 5 of [arXiv:2304.12465, Algorithm 4]. + // + // First we compute a submatrix of columns of A and then we downdate with GEMM. + // The downdate is delicate since the output matrix shares a buffer with one of the + // input matrices, but it's okay since they're non-overlapping regions of that buffer. + // + _rpchol_impl::compute_columns(layout, n, A_stateless, Sprime, F_panel); + // ^ F_panel = A(:, Sprime). + _rpchol_impl::pack_selected_rows(layout, n, ell, F, Sprime, work_mat.data()); + // ^ work_mat is a copy of F(Sprime, 1:ell). + blas::gemm( + layout, Op::NoTrans, Op::Trans, n, ell_incr, ell, + -1.0, F, n, work_mat.data(), ell_incr, 1.0, F_panel, n + ); + // + // 2.2. Execute Lines 6 and 7 of [arXiv:2304.12465, Algorithm 4]. + // + _rpchol_impl::pack_selected_rows(layout, n, ell_incr, F_panel, Sprime, work_mat.data()); + int c_status = lapack::potrf(uplo, ell_incr, work_mat.data(), ell_incr); + if (c_status) { + std::cout << "Cholesky failed with exit code " << c_status << ".\n"; + std::cout << "Returning early, with approximation rank = " << ell << "\n\n"; + k = ell; + cb(k); + return state; + } + blas::trsm( + layout, blas::Side::Right, uplo, Op::NoTrans, blas::Diag::NonUnit, + n, ell_incr, 1.0, work_mat.data(), ell_incr, F_panel, n + ); + + // + // 3. Update S, d, cdf and ell. 
+ // + std::copy(Sprime.begin(), Sprime.end(), S + ell); + w_status = _rpchol_impl::downdate_d_and_cdf(layout, n, Sprime, F_panel, d, cdf); + ell = ell + ell_incr; + } + cb(k); + return state; +} + +template +STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state) { + auto cb = [](int64_t i) { return i ;}; + state = rp_cholesky(n, A_stateless, k, S, F, b, state, cb); /* keep the advanced RNG state; "state" is passed by value, so it must be reassigned */ + return state; +} + + +} diff --git a/RandLAPACK/comps/rl_rs.hh b/RandLAPACK/comps/rl_rs.hh index 978be5c5..c4437057 100644 --- a/RandLAPACK/comps/rl_rs.hh +++ b/RandLAPACK/comps/rl_rs.hh @@ -132,11 +132,11 @@ int RS::call( if (p % 2 == 0) { // Fill n by k Omega RandBLAS::DenseDist D(n, k); - state = RandBLAS::fill_dense(D, Omega, state).second; + state = RandBLAS::fill_dense(D, Omega, state); } else { // Fill m by k Omega_1 RandBLAS::DenseDist D(m, k); - state = RandBLAS::fill_dense(D, Omega_1, state).second; + state = RandBLAS::fill_dense(D, Omega_1, state); // multiply A' by Omega results in n by k omega blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, Omega_1, m, 0.0, Omega, n); diff --git a/RandLAPACK/comps/rl_syps.hh b/RandLAPACK/comps/rl_syps.hh index e1d42a2a..e071bf9e 100644 --- a/RandLAPACK/comps/rl_syps.hh +++ b/RandLAPACK/comps/rl_syps.hh @@ -31,7 +31,7 @@ class SymmetricPowerSketch { ) = 0; virtual int call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t k, RandBLAS::RNGState &state, T* &skop_buff = nullptr, @@ -108,7 +108,7 @@ class SYPS : public SymmetricPowerSketch { ); int call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t k, RandBLAS::RNGState &state, T* &skop_buff, @@ -125,7 +125,7 @@ class SYPS : public SymmetricPowerSketch { // ----------------------------------------------------------------------------- template int SYPS::call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t k, RandBLAS::RNGState &state, T* &skop_buff, @@ -140,12 
+140,12 @@ int SYPS::call( if (!callers_skop_buff) skop_buff = new T[m * k]; RandBLAS::DenseDist D(m, k); - state = RandBLAS::fill_dense(D, skop_buff, state).second; + state = RandBLAS::fill_dense(D, skop_buff, state); bool callers_work_buff = work_buff != nullptr; if (!callers_work_buff) work_buff = new T[m * k]; - RandBLAS::util::safe_scal(m * k, 0.0, work_buff, 1); + RandBLAS::util::safe_scal(m * k, (T) 0.0, work_buff, 1); T *symm_out = work_buff; T *symm_in = skop_buff; @@ -166,8 +166,6 @@ int SYPS::call( if (p % 2 == 1) blas::copy(m * k, work_buff, 1, skop_buff, 1); - RandBLAS::DenseSkOp(D, state, skop_buff); - if (!callers_work_buff) delete[] work_buff; @@ -186,7 +184,7 @@ int SYPS::call( T* &skop_buff, T* work_buff ) { - ExplicitSymLinOp A_linop(m, uplo, A, lda, Layout::ColMajor); + linops::ExplicitSymLinOp A_linop(m, uplo, A, lda, Layout::ColMajor); return call(A_linop, k, state, skop_buff, work_buff); } diff --git a/RandLAPACK/comps/rl_syrf.hh b/RandLAPACK/comps/rl_syrf.hh index ab40d79e..246dd2de 100644 --- a/RandLAPACK/comps/rl_syrf.hh +++ b/RandLAPACK/comps/rl_syrf.hh @@ -22,7 +22,7 @@ class SymmetricRangeFinder { virtual ~SymmetricRangeFinder() {} virtual int call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t k, std::vector &Q, RandBLAS::RNGState &state, @@ -94,7 +94,7 @@ class SYRF : public SymmetricRangeFinder { ) override; int call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t k, std::vector &Q, RandBLAS::RNGState &state, @@ -116,7 +116,7 @@ class SYRF : public SymmetricRangeFinder { // ----------------------------------------------------------------------------- template int SYRF::call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t k, std::vector &Q, RandBLAS::RNGState &state, @@ -127,13 +127,13 @@ int SYRF::call( if (!callers_work_buff) work_buff = new T[m * k]; - RandBLAS::util::safe_scal(m * k, 0.0, work_buff, 1); + RandBLAS::util::safe_scal(m * k, (T) 
0.0, work_buff, 1); T* Q_dat = util::upsize(m * k, Q); SYPS_Obj.call(A, k, state, work_buff, Q_dat); // Q = orth(A * Omega) - A(Layout::ColMajor, k, 1.0, work_buff, m, 0.0, Q_dat, m); + A(Layout::ColMajor, k, (T) 1.0, work_buff, m, (T) 0.0, Q_dat, m); if(this->cond_check) { util::upsize(m * k, this->cond_work_mat); util::upsize(k, this->cond_work_vec); @@ -161,7 +161,7 @@ int SYRF::call( RandBLAS::RNGState &state, T* work_buff ) { - ExplicitSymLinOp A_linop(m, uplo, A, m, Layout::ColMajor); + linops::ExplicitSymLinOp A_linop(m, uplo, A, m, Layout::ColMajor); return this->call(A_linop, k, Q, state, work_buff); } diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh index bfdf3d18..8efcba40 100644 --- a/RandLAPACK/drivers/rl_cqrrp.hh +++ b/RandLAPACK/drivers/rl_cqrrp.hh @@ -298,7 +298,7 @@ int CQRRP_blocked::call( // as LU is not intended to be used with rank-deficient matrices. T* S = ( T * ) calloc( d * m, sizeof( T ) ); RandBLAS::DenseDist D(d, m); - state = RandBLAS::fill_dense(D, S, state).second; + state = RandBLAS::fill_dense(D, S, state); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, A, m, 0.0, A_sk, d); free(S); diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh index 360a8143..fed8038b 100644 --- a/RandLAPACK/drivers/rl_cqrrpt.hh +++ b/RandLAPACK/drivers/rl_cqrrpt.hh @@ -196,9 +196,10 @@ int CQRRPT::call( saso_t_start = high_resolution_clock::now(); /// Generating a SASO - RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz}; + RandBLAS::SparseDist DS(d, m, this->nnz); RandBLAS::SparseSkOp S(DS, state); - state = RandBLAS::fill_sparse(S); + RandBLAS::fill_sparse(S); + state = S.next_state; /// Applying a SASO RandBLAS::sketch_general( diff --git a/RandLAPACK/drivers/rl_hqrrp.hh b/RandLAPACK/drivers/rl_hqrrp.hh index b84a34f2..d565cd9d 100644 --- a/RandLAPACK/drivers/rl_hqrrp.hh +++ b/RandLAPACK/drivers/rl_hqrrp.hh @@ -345,18 +345,16 @@ int64_t 
NoFLA_Apply_Q_WY_lhfc_blk_var4( // ============================================================================ template int64_t NoFLA_QRP_compute_norms( - int64_t m_A, int64_t n_A, T * buff_A, int64_t ldim_A, - T * buff_d, T * buff_e ) { + int64_t m_A, int64_t n_A, T * buff_A, int64_t ldim_A, T * buff_d, T * buff_e +) { // // It computes the column norms of matrix A. The norms are stored int64_to // vectors d and e. // - int64_t j, i_one = 1; // Main loop. - //#pragma omp parallel for - for( j = 0; j < n_A; j++ ) { - * buff_d = blas::nrm2(m_A, buff_A, i_one); + for(int64_t j = 0; j < n_A; j++ ) { + * buff_d = blas::nrm2(m_A, buff_A, 1); * buff_e = * buff_d; buff_A += ldim_A; buff_d++; @@ -560,8 +558,8 @@ static int64_t CHOLQR_mod_WY( // Entries of tau will be placed on the main diagonal of matrix T from orhr_col(). for(i = 0; i < n_A; ++i) buff_t[i] = buff_T[(ldim_T + 1) * i]; - #endif return 0; + #endif } // ============================================================================ @@ -934,8 +932,8 @@ int64_t hqrrp( } // Initialize matrices G and Y. 
- RandBLAS::DenseDist D(nb_alg + pp, m_A, RandBLAS::DenseDistName::Uniform); - state = RandBLAS::fill_dense(D, buff_G, state).second; + RandBLAS::DenseDist D(nb_alg + pp, m_A, RandBLAS::ScalarDist::Uniform); + state = RandBLAS::fill_dense(D, buff_G, state); blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, m_Y, n_Y, m_A, diff --git a/RandLAPACK/drivers/rl_krillx.hh b/RandLAPACK/drivers/rl_krillx.hh new file mode 100644 index 00000000..9e0d2211 --- /dev/null +++ b/RandLAPACK/drivers/rl_krillx.hh @@ -0,0 +1,162 @@ +#pragma once + +#include "rl_blaspp.hh" +#include "rl_lapackpp.hh" +#include "rl_linops.hh" +#include "rl_preconditioners.hh" +#include "rl_rpchol.hh" +#include "rl_pdkernels.hh" +#include "rl_determiter.hh" + +#include +#include +#include + +/** + * + * TODO: + * (1) finish and test krill_restricted_rpchol + * (2) write and test a krill_restricted function that accepts the centers as inputs + * in advance. + * (3) See also, rl_preconditioners.hh + * + */ + +namespace RandLAPACK { + +/** + * Fun thing about the name KRILLx: + * + * we can do KRILLrs for KRILL with lockstep PCG for regularization sweep. + * + * we can do KRILLb (?) for "random lifting + block" version. 
+ */ + +using std::vector; + +template +STATE krill_full_rpchol( + int64_t n, FUNC &G, vector &H, vector &X, T tol, + STATE state, SEMINORM seminorm, int64_t rpchol_block_size = -1, int64_t max_iters = 20, int64_t k = -1 +) { + auto mus = G.regs; + int64_t ell = ((int64_t) H.size()) / n; + randblas_require(ell * n == (int64_t) H.size()); + int64_t mu_size = mus.size(); + randblas_require(mu_size == 1 || mu_size == ell); + + if (rpchol_block_size < 0) + rpchol_block_size = std::min((int64_t) 64, n/4); + if (k < 0) + k = (int64_t) std::sqrt(n); + + vector V(n*k, 0.0); + vector eigvals(k, 0.0); + G.set_eval_includes_reg(false); + state = rpchol_pc_data(n, G, k, rpchol_block_size, V.data(), eigvals.data(), state); + linops::SpectralPrecond invP(n); + invP.prep(V, eigvals, mus, ell); + G.set_eval_includes_reg(true); + lockorblock_pcg(G, H, tol, max_iters, invP, seminorm, X, true); + + return state; +} + +/** + * We start with a regularized kernel linear operator G and target data H. + * We use "K" to denote the unregularized version of G, which can be accessed + * by calling G.set_eval_includes_reg(false); + * + * If G.regs.size() == 1, then the nominal KRR problem reduces to computing + * + * (K + G.regs[0] * I) X = H. (*) + * + * If G.regs.size() > 1, then KRR is nominally about solving the independent + * collection of problems + * + * (K + mu_i * I) x_i = h_i, (**) + * + * where K is the unregularized version of G, mu_i = G.regs[i], and x_i, h_i + * are the i-th columns of X and H respectively. In this situation we need + * H to have exactly G.regs.size() columns. + * + * This function produces __approximate__ solutions to KRR problems. It does so + * by finding a set of indices for which + * + * K_hat = K(:,inds) * inv(K(inds, inds)) * K(inds, :) + * + * is a good low-rank approximation of K. We spend O(n*k^2) arithmetic operations and + * O(n*k_ evaluations of K(i,j) to get our hands on "inds" and a factored representation + * of K_hat. 
+ * + * Given inds, we turn our attention to solving the problem + * + * min{ || K(:,inds) x - H ||_2^2 + mu || sqrtm(K(inds, inds)) x ||_2^2 : x }. + * + * We don't store K(:,inds) explicitly. Instead, we have access to a matrix V where + * + * (i) K_hat = VV', + * (ii) V(inds,:)V(inds,:)' = K(inds, inds), and + * (iii) V*V(inds,:)' = K_hat(:,inds) = K(:, inds). + * + * If we abbreviate M := V(inds, :), then the restricted KRR problem can be framed as + * + * min{ || V M' x - H ||_2^2 + mu || M' X ||_2^2 : x }. + * + * We approach this by a change of basis, solving problems like + * + * min{ ||V y - H||_2^2 + mus || y ||_2^2 : y } (***) + * + * and then returning x = inv(M') y. + * + * Note that since we spend O(n*k^2) time getting our hands on V and inds, it would be + * reasonable to spend O(n*k^2) additional time to solve (***) by a direct method. + * However, it is easy enough to reduce the cost of solving (***) to o(n*k^2) + * (that is, little-o of n*k^2) by a sketch and precondition approach. + * + */ +template +STATE krill_restricted_rpchol( + int64_t n, FUNC &G, vector &H, vector &X, T tol, + STATE state, SEMINORM seminorm, int64_t rpchol_block_size = -1, int64_t max_iters = 20, int64_t k = -1 +) { + // NOTE: on entry, X is n-by-s for some integer s. That's way bigger than it needs to be, since the + // solution we return can be written down with k*s nonzeros plus k indices to indicate which rows of X + // are nonzero. + vector V(n*k, 0.0); + vector eigvals(k, 0.0); + G.set_eval_includes_reg(false); + + vector inds(k, -1); + state = rp_cholesky(n, G, k, inds.data(), V.data(), rpchol_block_size, state); + inds.resize(k); + // ^ VV' defines a rank-k Nystrom approximation of G. The approximation satisfies + // + // VV' = G(:,inds) * inv(G(inds, inds)) * G(inds, :) + // and + // (VV')(inds, inds) = G(inds, inds). + // + // That second identity can be written as MM' = G(inds, inds) for M = V(inds, :). 
+ // + + + vector M(k * k); + _rpchol_impl::pack_selected_rows(blas::Layout::ColMajor, n, k, V.data(), inds, M.data()); + // + // + // + + linops::SpectralPrecond invP(n); + // invP.prep(V, eigvals, mus, ell); + return state; +} + +// template +// STATE krill_block( +// +// ) { +// +// } + + +} // end namespace RandLAPACK diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh index 9c227422..85cf6480 100644 --- a/RandLAPACK/drivers/rl_rbki.hh +++ b/RandLAPACK/drivers/rl_rbki.hh @@ -248,7 +248,7 @@ int RBKI::call( omp_set_num_threads(this->num_threads_some); #endif RandBLAS::DenseDist D(n, k); - state = RandBLAS::fill_dense(D, Y_i, state).second; + state = RandBLAS::fill_dense(D, Y_i, state); #if RandLAPACK_HAS_OpenMP omp_set_num_threads(this->num_threads_rest); #endif diff --git a/RandLAPACK/drivers/rl_revd2.hh b/RandLAPACK/drivers/rl_revd2.hh index e5eee65f..bf6d49e9 100644 --- a/RandLAPACK/drivers/rl_revd2.hh +++ b/RandLAPACK/drivers/rl_revd2.hh @@ -31,7 +31,7 @@ class REVD2alg { ) = 0; virtual int call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t &k, T tol, std::vector &V, @@ -98,7 +98,7 @@ class REVD2 : public REVD2alg { ) override; int call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t &k, T tol, std::vector &V, @@ -127,7 +127,7 @@ class REVD2 : public REVD2alg { /// All other parameters come from REVD2 template T power_error_est( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t k, int p, T* vector_buf, @@ -176,7 +176,7 @@ T power_error_est( template int REVD2::call( - SymmetricLinearOperator &A, + linops::SymmetricLinearOperator &A, int64_t &k, T tol, std::vector &V, @@ -252,8 +252,8 @@ int REVD2::call( // Using the first column of Omega as a buffer for a random vector // To perform the following safely, need to make sure Omega has at least 4 columns Omega_dat = util::upsize(m * 4, this->Omega); - RandBLAS::DenseDist g(m, 1); - error_est_state = 
RandBLAS::fill_dense(g, Omega_dat, error_est_state).second; + RandBLAS::DenseDist g(m, 1); + error_est_state = RandBLAS::fill_dense(g, Omega_dat, error_est_state); err = power_error_est(A, k, this->error_est_p, Omega_dat, V_dat, Y_dat, eigvals.data()); @@ -279,7 +279,7 @@ int REVD2::call( std::vector &eigvals, RandBLAS::RNGState &state ) { - ExplicitSymLinOp A_linop(m, uplo, A, m, Layout::ColMajor); + linops::ExplicitSymLinOp A_linop(m, uplo, A, m, Layout::ColMajor); return this->call(A_linop, k, tol, V, eigvals, state); } diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh index 1068ffad..bdc2f4a4 100644 --- a/RandLAPACK/misc/rl_gen.hh +++ b/RandLAPACK/misc/rl_gen.hh @@ -30,35 +30,31 @@ enum mat_type { /// A struct containing info about a given matrix to be generated by mat_gen(). /// Requires only the size and type of a matrix by default, but can have other optional parameters. +/// +/// We set defaults in the member declaration in case people try to struct-initialize this type. +/// template struct mat_gen_info { int64_t rows; int64_t cols; int64_t rank; mat_type m_type; - T cond_num; - T scaling; - T exponent; - bool diag; - bool check_true_rank; - T theta; - T perturb; + T cond_num = 1.0; + T scaling = 1.0; + T exponent = 1.0; + bool diag = false; + bool check_true_rank = false; + T theta = 1.0; + T perturb = 1.0; char* filename; int workspace_query_mod; + T frac_spectrum_one = 0.1; mat_gen_info(int64_t& m, int64_t& n, mat_type t) { rows = m; cols = n; m_type = t; - /// default values - diag = false; - rank = n; - cond_num = 1.0; - scaling = 1.0; - exponent = 1.0; - theta = 1.0; - perturb = 1.0; - check_true_rank = false; + rank = n; // <-- default value. 
} }; @@ -80,8 +76,8 @@ void gen_singvec( RandBLAS::DenseDist DU(m, k); RandBLAS::DenseDist DV(n, k); - state = RandBLAS::fill_dense(DU, U, state).second; - state = RandBLAS::fill_dense(DV, V, state).second; + state = RandBLAS::fill_dense(DU, U, state); + state = RandBLAS::fill_dense(DV, V, state); blas::copy(k, S, k + 1, A, m + 1); @@ -98,8 +94,8 @@ void gen_singvec( /// Generates a matrix with polynomially-decaying spectrum of the following form: /// s_i = a(i + b)^p, where p is the user-defined exponent constant, a and b are computed -/// using p and the user-defined condition number parameter and the first 10 percent of the -/// singular values are equal to one. +/// using p and the user-defined condition number parameter and the first +/// (100 * frac_spectrum_one) percent of the singular values are equal to one. /// User can optionally choose for the matrix to be diagonal. /// The output matrix has k singular values. template @@ -108,6 +104,7 @@ void gen_poly_mat( int64_t &n, T* A, int64_t k, + T frac_spectrum_one, T cond, T p, bool diagon, @@ -119,11 +116,12 @@ void gen_poly_mat( T* S = ( T * ) calloc( k * k, sizeof( T ) ); // The first 10% of the singular values will be equal to one - int offset = (int) floor(k * 0.1); + int offset = (int) floor(k * frac_spectrum_one); T first_entry = 1.0; T last_entry = first_entry / cond; - T a = std::pow((std::pow(last_entry, -1 / p) - std::pow(first_entry, -1 / p)) / (k - offset), p); - T b = std::pow(a * first_entry, -1 / p) - offset; + T neg_invp = -((T)1.0)/p; + T a = std::pow((std::pow(last_entry, neg_invp) - std::pow(first_entry, neg_invp)) / (k - offset), p); + T b = std::pow(a * first_entry, neg_invp) - offset; // apply lambda function to every entry of s std::fill(s, s + offset, 1.0); for (int i = offset; i < k; ++i) { @@ -243,16 +241,15 @@ void gen_spiked_mat( ) { int64_t num_rows_sampled = n / 2; - /// sample from [m] without replacement. Get the row indices for a tall LASO with a single column. 
- RandBLAS::SparseDist DS = {.n_rows = m, .n_cols = 1, .vec_nnz = num_rows_sampled, .major_axis = RandBLAS::MajorAxis::Long}; - RandBLAS::SparseSkOp S(DS, state); - state = RandBLAS::fill_sparse(S); + /// sample from [m] without replacement + int64_t* rows = new int64_t[num_rows_sampled]{}; + state = RandBLAS::repeated_fisher_yates(num_rows_sampled, m, 1, rows, state); T* V = ( T * ) calloc( n * n, sizeof( T ) ); T* tau = ( T * ) calloc( n, sizeof( T ) ); RandBLAS::DenseDist DV(n, n); - state = RandBLAS::fill_dense(DV, V, state).second; + state = RandBLAS::fill_dense(DV, V, state); lapack::geqrf(n, n, V, n, tau); lapack::ungqr(n, n, n, V, n, tau); @@ -269,11 +266,12 @@ void gen_spiked_mat( for (i = 0; i < n; ++ i) { for (j = 0; j < num_rows_sampled; ++j) { - A[m * i + S.rows[j]] *= spike_scale; + A[m * i + rows[j]] *= spike_scale; } j = 0; } + delete [] rows; free(V); free(tau); } @@ -304,10 +302,10 @@ void gen_oleg_adversarial_mat( T* tau2 = ( T * ) calloc( n, sizeof( T ) ); RandBLAS::DenseDist DU(m, n); - state = RandBLAS::fill_dense(DU, U, state).second; + state = RandBLAS::fill_dense(DU, U, state); RandBLAS::DenseDist DV(n, n); - state = RandBLAS::fill_dense(DV, V, state).second; + state = RandBLAS::fill_dense(DV, V, state); for(int i = 0; i < n; ++i) { //U_dat[m * i + 1] *= scaling_factor_U; @@ -470,7 +468,7 @@ void mat_gen( switch(info.m_type) { case polynomial: // Generating matrix with polynomially decaying singular values - RandLAPACK::gen::gen_poly_mat(info.rows, info.cols, A, info.rank, info.cond_num, info.exponent, info.diag, state); + RandLAPACK::gen::gen_poly_mat(info.rows, info.cols, A, info.rank, info.frac_spectrum_one, info.cond_num, info.exponent, info.diag, state); break; case exponential: // Generating matrix with exponentially decaying singular values @@ -480,7 +478,7 @@ void mat_gen( case gaussian: { // Gaussian random matrix RandBLAS::DenseDist D(info.rows, info.cols); - state = RandBLAS::fill_dense(D, A, state).second; + state = 
RandBLAS::fill_dense(D, A, state); } break; case step: { @@ -522,4 +520,12 @@ void mat_gen( break; } } + +template +std::vector mat_gen(mat_gen_info &info, RandBLAS::RNGState &state) { + std::vector A(info.rows * info.cols, 0.0); + mat_gen(info, A.data(), state); + return A; +} + } diff --git a/RandLAPACK/misc/rl_linops.hh b/RandLAPACK/misc/rl_linops.hh index e5a733df..767d5357 100644 --- a/RandLAPACK/misc/rl_linops.hh +++ b/RandLAPACK/misc/rl_linops.hh @@ -11,7 +11,9 @@ #include #include -namespace RandLAPACK { +namespace RandLAPACK::linops { + +using std::vector; template struct SymmetricLinearOperator { @@ -41,31 +43,33 @@ struct SymmetricLinearOperator { int64_t ldc ) = 0; + virtual T operator()(int64_t i, int64_t j) = 0; + virtual ~SymmetricLinearOperator() {} }; template struct ExplicitSymLinOp : public SymmetricLinearOperator { - const Uplo uplo; + const blas::Uplo uplo; const T* A_buff; const int64_t lda; - const Layout buff_layout; + const blas::Layout buff_layout; ExplicitSymLinOp( int64_t m, - Uplo uplo, + blas::Uplo uplo, const T* A_buff, int64_t lda, - Layout buff_layout - ) : SymmetricLinearOperator(m), uplo(uplo), A_buff(A_buff), lda(lda), buff_layout(buff_layout) {}; + blas::Layout buff_layout + ) : SymmetricLinearOperator(m), uplo(uplo), A_buff(A_buff), lda(lda), buff_layout(buff_layout) {} // Note: the "layout" parameter here is interpreted for (B and C). // If layout conflicts with this->buff_layout then we manipulate // parameters to blas::symm to reconcile the different layouts of // A vs (B, C). void operator()( - Layout layout, + blas::Layout layout, int64_t n, T alpha, T* const B, @@ -78,15 +82,241 @@ struct ExplicitSymLinOp : public SymmetricLinearOperator { randblas_require(ldc >= this->m); auto blas_call_uplo = this->uplo; if (layout != this->buff_layout) - blas_call_uplo = (this->uplo == Uplo::Upper) ? Uplo::Lower : Uplo::Upper; + blas_call_uplo = (this->uplo == blas::Uplo::Upper) ? 
blas::Uplo::Lower : blas::Uplo::Upper; // Reading the "blas_call_uplo" triangle of "this->A_buff" in "layout" order is the same // as reading the "this->uplo" triangle of "this->A_buff" in "this->buff_layout" order. blas::symm( layout, Side::Left, blas_call_uplo, this->m, n, alpha, this->A_buff, this->lda, B, ldb, beta, C, ldc ); - }; + } + + inline T operator()(int64_t i, int64_t j) { + randblas_require(this->uplo == blas::Uplo::Upper && this->buff_layout == blas::Layout::ColMajor); + if (i > j) { + return A_buff[j + i*lda]; + } else { + return A_buff[i + j*lda]; + } + } +}; + +template +struct RegExplicitSymLinOp : public SymmetricLinearOperator { + + const T* A_buff; + const int64_t lda; + vector regs; + bool _eval_includes_reg; + + static const blas::Uplo uplo = blas::Uplo::Upper; + static const blas::Layout buff_layout = blas::Layout::ColMajor; + using scalar_t = T; + + RegExplicitSymLinOp( + int64_t m, const T* A_buff, int64_t lda, vector ®s + ) : SymmetricLinearOperator(m), A_buff(A_buff), lda(lda), regs(regs) { + randblas_require(lda >= m); + _eval_includes_reg = false; + } + + void set_eval_includes_reg(bool eir) { + _eval_includes_reg = eir; + } + + void operator()(blas::Layout layout, int64_t n, T alpha, T* const B, int64_t ldb, T beta, T* C, int64_t ldc) { + randblas_require(layout == this->buff_layout); + randblas_require(ldb >= this->m); + randblas_require(ldc >= this->m); + blas::symm(layout, blas::Side::Left, this->uplo, this->m, n, alpha, this->A_buff, this->lda, B, ldb, beta, C, ldc); + + if (_eval_includes_reg) { + int64_t num_regs = this->regs.size(); + if (num_regs != 1) + randblas_require(n == num_regs); + T* regsp = regs.data(); + for (int64_t i = 0; i < n; ++i) { + T coeff = alpha * regsp[std::min(i, num_regs - 1)]; + blas::axpy(this->m, coeff, B + i*ldb, 1, C + i*ldc, 1); + } + } + return; + } + + inline T operator()(int64_t i, int64_t j) { + T val; + if (i > j) { + val = A_buff[j + i*lda]; + } else { + val = A_buff[i + j*lda]; + } + if 
(_eval_includes_reg) { + randblas_require(regs.size() == 1); + val += regs[0]; + } + return val; + } + +}; + +template +struct SpectralPrecond { + + public: + using scalar_t = T; + const int64_t m; + int64_t k; + int64_t s; + vector V; + T* V_ptr; + vector D; + T* D_ptr; + vector work; + T* work_ptr; + int64_t num_regs = 1; + + /* Suppose we want to precondition a positive semidefinite matrix G_mu = G + mu*I. + * + * Once properly preparred, this preconditioner represents a linear operator of the form + * P = V diag(D) V' + I. + * The columns of V approximate the top k eigenvectors of G, while the + * entries of D are *functions of* the corresponding approximate eigenvalues. + * + * The specific form of the entries of D are as follows. Suppose we start with + * (V, lambda) as approximations of the top k eigenpairs of G, and define the vector + * D0 = (min(lambda) + mu) / (lambda + mu). + * From a mathematical perspective, this preconditioner represents the linear operator + * P = V diag(D0) V' + (I - VV'). + * The action of this linear operator can be computed with two calls to GEMM + * instead of three if we store D = D0 - 1 instead of D0 itself. + */ + + SpectralPrecond(int64_t m) + : m(m), k(1), s(1), + V(k * m), V_ptr(V.data()), + D(k), D_ptr(D.data()), + work(k * s), work_ptr(work.data()) {} + + // Move constructor + // Call as SpectralPrecond spc(std::move(other)) when we want to transfer the + // contents of "other" to "this". + SpectralPrecond(SpectralPrecond &&other) noexcept + : m(other.m), k(other.k), s(other.s), + V(std::move(other.V)), V_ptr(V.data()), + D(std::move(other.D)), D_ptr(D.data()), + work(std::move(other.work)), work_ptr(work.data()), + num_regs(other.num_regs) {} + + // Copy constructor + // Call as SpectralPrecond spc(other) when we want to copy "other". 
+ SpectralPrecond(const SpectralPrecond &other) + : m(other.m), k(other.k), s(other.s), + V(other.V), V_ptr(V.data()), + D(other.D), D_ptr(D.data()), + work(other.work), work_ptr(work.data()), + num_regs(other.num_regs) {} + + void prep(vector &eigvecs, vector &eigvals, vector &mus, int64_t arg_s) { + // assume eigvals are positive numbers sorted in decreasing order. + num_regs = mus.size(); + randblas_require(num_regs == 1 || num_regs == arg_s); + k = eigvals.size(); + D.resize(k * num_regs); + + s = arg_s; + V = eigvecs; + V_ptr = V.data(); + work.resize(k * s); + work_ptr = work.data(); + + D_ptr = D.data(); + for (int64_t r = 0; r < num_regs; ++r) { + T mu_r = mus[r]; + T* D_r = &D_ptr[r*k]; + T numerator = eigvals[k-1] + mu_r; + for (int i = 0; i < k; ++i) + D_r[i] = (numerator / (eigvals[i] + mu_r)) - 1.0; + } + return; + } + + void evaluate(int64_t s, const T *x, T *dest) { + operator()(blas::Layout::ColMajor, s, (T) 1.0, x, m, (T) 0.0, dest, m); + return; + } + + void operator()( + blas::Layout layout, int64_t n, T alpha, const T* B, int64_t ldb, T beta, T* C, int64_t ldc + ) { + randblas_require(layout == blas::Layout::ColMajor); + randblas_require(ldb >= this->m); + randblas_require(ldc >= this->m); + if (this->num_regs != 1) { + randblas_require(n == num_regs); + } else { + randblas_require(this->s >= n); + } + // update C = alpha*(V diag(D) V' + I)B + beta*C + // Step 1: w = V'B with blas::gemm + // Step 2: w = D w with our own kernel + // Step 3: C = beta * C + alpha * B with blas::copy or blas::scal + blas::axpy + // Step 4: C = alpha * V w + C with blas::gemm + blas::gemm(layout, blas::Op::Trans, blas::Op::NoTrans, k, n, m, (T) 1.0, V_ptr, m, B, ldb, (T) 0.0, work_ptr, k); + + // -----> start step 2 + #define mat_D(_i, _j) ((num_regs == 1) ? 
D_ptr[(_i)] : D_ptr[(_i) + k*(_j)]) + #define mat_work(_i, _j) work_ptr[(_i) + k*(_j)] + for (int64_t j = 0; j < n; j++) { + for (int64_t i = 0; i < k; i++) { + mat_work(i, j) = mat_D(i, j) * mat_work(i, j); + } + } + #undef mat_D + #undef mat_work + // <----- end step 2 + + // -----> start step 3 + int64_t i; + #define colB(_i) &B[(_i)*ldb] + #define colC(_i) &C[(_i)*ldb] + if (beta == (T) 0.0 && alpha == (T) 1.0) { + for (i = 0; i < n; ++i) + blas::copy(m, colB(i), 1, colC(i), 1); + } else { + for (i = 0; i < n; ++i) { + T* Ci = colC(i); + blas::scal(m, beta, Ci, 1); + blas::axpy(m, alpha, colB(i), 1, Ci, 1); + } + } + #undef colB + #undef colC + // <----- end step 3 + + blas::gemm(layout, blas::Op::NoTrans, blas::Op::NoTrans, m, n, k, (T) 1.0, V_ptr, m, work_ptr, k, 1.0, C, ldc); + return; + } }; +// template +// struct ConjSpectralPrecond { +// public: +// using scalar_t = T; +// SpectralPrecond spectral_precond; +// vector ut_conjugator; + +// ConjSpectralPrecond(SpectralPrecond &sp, std::vector &utc) +// : spectral_precond(sp), ut_conjugator(utc) {} + +// ConjSpectralPrecond(SpectralPrecond &&sp, std::vector &&utc) +// : spectral_precond(std::move(sp)), ut_conjugator(std::move(utc)) {} + +// void operator()( +// blas::Layout layout, int64_t n, T alpha, const T* B, int64_t ldb, T beta, T* C, int64_t ldc +// ) { +// randblas_require(layout == blas::Layout::ColMajor); + +// } +// }; -} // end namespace RandLAPACK +} // end namespace RandLAPACK::linops diff --git a/RandLAPACK/misc/rl_pdkernels.hh b/RandLAPACK/misc/rl_pdkernels.hh new file mode 100644 index 00000000..5c90919d --- /dev/null +++ b/RandLAPACK/misc/rl_pdkernels.hh @@ -0,0 +1,282 @@ +#ifndef randlapack_misc_pdkernels_h +#define randlapack_misc_pdkernels_h + +#include "rl_blaspp.hh" +#include "rl_linops.hh" +#include + +#include +#include +#include +#include +#include +#include + +namespace RandLAPACK { + +/*** + * X is a rows_x by cols_x matrix stored in column major format with + * leading dimension 
equal to rows_x. Each column of X is interpreted + * as a datapoint in "rows_x" dimensional space. mu and sigma are + * buffers of length rows_x. If use_input_mu_sigma is false then this + * function overwrites them as follows: + * + * mu(i) = [the sample mean of X(i,1), ..., X(i, end) ]. + * + * sigma(i) = [the sample standard deviation of X(i,1), ..., X(i, end) ]. + * + * This function subtracts off a copy of "mu" from each column of X and + * divides each row of X by the corresponding entry of sigma. + * On exit, each row of X has mean 0.0 and sample standard deviation 1.0. + * + */ +template +void standardize_dataset( + int64_t rows_x, int64_t cols_x, T* X, T* mu, T* sigma, bool use_input_mu_sigma = false +) { + randblas_require(cols_x >= 2); + if (! use_input_mu_sigma) { + std::fill(mu, mu + rows_x, (T) 0.0); + std::fill(sigma, sigma + rows_x, (T) 0.0); + } + T* ones_cols_x = new T[cols_x]{1.0}; + blas::gemv(blas::Layout::ColMajor, blas::Op::NoTrans, rows_x, cols_x, 1.0/ (T)rows_x, X, rows_x, ones_cols_x, 1, (T) 0.0, mu, 1); + // ^ Computes the mean + blas::ger(blas::Layout::ColMajor, rows_x, cols_x, -1, mu, 1, ones_cols_x, 1, X, rows_x); + // ^ Performs a rank-1 update to subtract off the mean. + delete [] ones_cols_x; + // Up next: compute the sample standard deviations and rescale each row to have sample stddev = 1. + T stddev_scale = std::sqrt((T) (cols_x - 1)); + for (int64_t i = 0; i < rows_x; ++i) { + sigma[i] = blas::nrm2(cols_x, X + i, rows_x); + sigma[i] /= stddev_scale; + blas::scal(cols_x, (T) 1.0 / sigma[i], X + i, rows_x); + } + return; +} + +/*** + * X is a rows_x by cols_x matrix stored in column major format with + * leading dimension equal to rows_x; sq_colnorms_x is a buffer of + * length "cols_x" whose j-th entry is ||X(:,j)||_2^2. 
+ * + * The Euclidean distance matrix induced by X has entries + * + * E(i,j) = ||X(:,i) - X(:, J)||_2^2 + * + * This function computes the contiguous submatrix of E of dimensions + * rows_eds by cols_eds, whose upper-left corner is offset by + * (ro_eds, co_eds) from the upper-left corner of the full matrix E. + * + * On exit, Eds contains that computed submatrix. + */ +template +void euclidean_distance_submatrix( + int64_t rows_x, int64_t cols_x, const T* X, const T* sq_colnorms_x, + int64_t rows_eds, int64_t cols_eds, T* Eds, int64_t ro_eds, int64_t co_eds +) { + randblas_require((0 <= co_eds) && ((co_eds + cols_eds) <= cols_x)); + randblas_require((0 <= ro_eds) && ((ro_eds + rows_eds) <= cols_x)); + const T* sq_colnorms_for_rows = sq_colnorms_x + ro_eds; + const T* sq_colnorms_for_cols = sq_colnorms_x + co_eds; + + std::vector ones(rows_eds, 1.0); + T* ones_d = ones.data(); + for (int64_t j = 0; j < cols_eds; ++j) { + T* Eds_col = Eds + rows_eds*j; + blas::copy(rows_eds, sq_colnorms_for_rows, 1, Eds_col, 1); + blas::axpy(rows_eds, sq_colnorms_for_cols[j], ones_d, 1, Eds_col, 1); + } + + const T* X_subros = X + rows_x * ro_eds; + const T* X_subcos = X + rows_x * co_eds; + blas::gemm( + blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans, + rows_eds, cols_eds, rows_x, + -2.0, X_subros, rows_x, X_subcos, rows_x, 1.0, Eds, rows_eds + ); + return; +} + +template +T squared_exp_kernel(int64_t dim, const T* x, const T* y, T bandwidth) { + T sq_nrm = 0.0; + T scale = std::sqrt(2.0)*bandwidth; + for (int64_t i = 0; i < dim; ++i) { + T diff = (x[i] - y[i])/scale; + sq_nrm += diff*diff; + } + return std::exp(-sq_nrm); +} + +/*** + * X is a rows_x by cols_x matrix stored in column major format with + * leading dimension equal to rows_x; sq_colnorms_x is a buffer of + * length "cols_x" whose j-th entry is ||X(:,j)||_2^2. 
+ * + * The squared exponential kernel with scale given by "bandwidth" is + * a matrix of the form + * + * K(i, j) = exp(- ||X(:,i) - X(:, J)||_2^2 / (2*bandwidth^2)) + * + * That is -- each column of X defines a datapoint, and K is the induced + * positive (semi)definite kernel matrix. + * + * This function computes the contiguous submatrix of K of dimensions + * rows_ksub by cols_ksub, whose upper-left corner is offset by + * (ro_ksub, co_ksub) from the upper-left corner of the full matrix K. + * + * The result is stored in "Ksub", which is interpreted in column-major + * order with leading dimension equal to rows_ksub. + */ +template +void squared_exp_kernel_submatrix( + int64_t rows_x, int64_t cols_x, const T* X, T* sq_colnorms_x, + int64_t rows_ksub, int64_t cols_ksub, T* Ksub, int64_t ro_ksub, int64_t co_ksub, + T bandwidth +) { + int64_t size_Ksub = rows_ksub * cols_ksub; + randblas_require(bandwidth > 0); + euclidean_distance_submatrix(rows_x, cols_x, X, sq_colnorms_x, rows_ksub, cols_ksub, Ksub, ro_ksub, co_ksub); + T scale = -1.0 / (2.0 * bandwidth * bandwidth); + auto inplace_exp = [scale](T &val) { val = std::exp(scale*val); }; + #pragma omp parallel for + for (int64_t i = 0; i < size_Ksub; ++i) { + inplace_exp(Ksub[i]); + } + return; +} + + +/** + * D = [A ][ B ] C + * [B'][ 0 ] + * + * where A is k-by-k, B is k-by-ell, and C has n columns. + * + * All matrices are column-major; A and B have leading dimension k. d + * + */ +template +void block_arrowhead_multiply(int64_t k, int64_t ell, int64_t n, const T* A, const T* B, const T* C, int64_t ldc, T* D, int64_t ldd ) { + auto layout = blas::Layout::ColMajor; + using blas::Op; + const T* C_top = C; + const T* C_bot = C + k; + T* D_top = D; + T* D_bot = D + k; + // + // Step 1. D_top += alpha * A * C_top + // + blas::gemm(layout, Op::NoTrans, Op::NoTrans, k, n, k, (T) 1.0, A, k, C_top, ldc, (T) 0.0, D_top, ldd); + if (ell > 0) { + // + // Step 2. 
D_top += alpha * B * C_bot + // + blas::gemm(layout, Op::NoTrans, Op::NoTrans, k, n, ell, (T) 1.0, B, k, C_bot, ldc, (T) 1.0, D_top, ldd); + // + // Step 3. D_bot += alpha * B' * C_top + // + blas::gemm(layout, Op::Trans, Op::NoTrans, ell, n, k, (T) 1.0, B, k, C_top, ldc, (T) 0.0, D_bot, ldd); + } + return; +} + + +namespace linops { + +/*** + * It might be practical to have one class that handles several different kinds of kernels. + */ +template +struct RBFKernelMatrix : public SymmetricLinearOperator { + // squared exp kernel linear operator + const T* X; + const int64_t rows_x; + T bandwidth; + vector regs; + + vector _sq_colnorms_x; + vector _eval_work1; + vector _eval_work2; + bool _eval_includes_reg; + int64_t _eval_block_size; + + using scalar_t = T; + + RBFKernelMatrix( + int64_t m, const T* X, int64_t rows_x, T bandwidth, vector ®s + ) : SymmetricLinearOperator(m), X(X), rows_x(rows_x), bandwidth(bandwidth), regs(regs), _sq_colnorms_x(m), _eval_work1{}, _eval_work2{} { + for (int64_t i = 0; i < m; ++i) { + _sq_colnorms_x[i] = std::pow(blas::nrm2(rows_x, X + i*rows_x, 1), 2); + } + _eval_block_size = std::min(m / ((int64_t) 4), (int64_t) 512); + _eval_work1.resize(_eval_block_size * m); + _eval_includes_reg = false; + return; + } + + void _prep_eval_work1(int64_t rows_ksub, int64_t cols_ksub, int64_t ro_ksub, int64_t co_ksub) { + randblas_require(rows_ksub * cols_ksub <= (int64_t) _eval_work1.size()); + squared_exp_kernel_submatrix( + rows_x, this->m, X, _sq_colnorms_x.data(), + rows_ksub, cols_ksub, _eval_work1.data(), ro_ksub, co_ksub, bandwidth + ); + } + + void set_eval_includes_reg(bool eir) { + _eval_includes_reg = eir; + } + + void operator()(blas::Layout layout, int64_t n, T alpha, T* const B, int64_t ldb, T beta, T* C, int64_t ldc) { + randblas_require(layout == blas::Layout::ColMajor); + randblas_require(ldb >= this->m); + randblas_require(ldc >= this->m); + + _eval_work2.resize(this->m * n); + for (int64_t i = 0; i < n; ++i) { + 
blas::scal(this->m, beta, C + i*ldc, 1); + } + int64_t done = 0; + int64_t todo = this->m; + while (todo > 0) { + int64_t k = std::min(_eval_block_size, todo); + _prep_eval_work1(k, todo, done, done); + const T* arrowhead_A = _eval_work1.data(); + const T* arrowhead_B = arrowhead_A + k * k; + const T* arrowhead_C = B + done; + T* arrowhead_D = _eval_work2.data(); + int64_t ell = (todo > k) ? (todo - k) : 0; + block_arrowhead_multiply(k, ell, n, arrowhead_A, arrowhead_B, arrowhead_C, ldb, arrowhead_D, todo); + for (int i = 0; i < n; ++i) { + blas::axpy(todo, alpha, arrowhead_D + i*todo, 1, C + done + i*ldc, 1); + } + done += k; + todo -= k; + } + if (_eval_includes_reg) { + int64_t num_regs = this->regs.size(); + randblas_require(num_regs == 1 || n == num_regs); + T* regsp = regs.data(); + for (int64_t i = 0; i < n; ++i) { + T coeff = alpha * regsp[std::min(i, num_regs - 1)]; + blas::axpy(this->m, coeff, B + i*ldb, 1, C + i*ldc, 1); + } + } + return; + } + + inline T operator()(int64_t i, int64_t j) { + T val = squared_exp_kernel(rows_x, X + i*rows_x, X + j*rows_x, bandwidth); + if (_eval_includes_reg) { + randblas_require(regs.size() == 1); + val += regs[0]; + } + return val; + } +}; + +} // end namespace RandLAPACK::linops + +} +#endif \ No newline at end of file diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh index 15f516ae..eba68d9d 100644 --- a/RandLAPACK/misc/rl_util.hh +++ b/RandLAPACK/misc/rl_util.hh @@ -273,7 +273,7 @@ T estimate_spectral_norm( std::vector buf1 (m, 0.0); RandBLAS::DenseDist DV(n, 1); - state = RandBLAS::fill_dense(DV, buf.data(), state).second; + state = RandBLAS::fill_dense(DV, buf.data(), state); T prev_norm_inv = 1.0; for(int i = 0; i < p; ++i) { diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt index 8be3b062..dd17e750 100644 --- a/benchmark/CMakeLists.txt +++ b/benchmark/CMakeLists.txt @@ -1,4 +1,7 @@ -cmake_minimum_required(VERSION 3.10) +cmake_minimum_required(VERSION 3.27) + +# cmake 
-DCMAKE_BUILD_TYPE=Release -DRandom123_DIR=`pwd`/../../../random123-install/include/ -Dblaspp_DIR=`pwd`/../../../blaspp-install/lib/cmake/blaspp/ -DRandLAPACK_DIR=`pwd`/../../../RandLAPACK-install/lib/cmake/ -Dlapackpp_DIR=`pwd`/../../../lapackpp-install/lib/cmake/lapackpp/ -DCMAKE_BINARY_DIR=`pwd` .. +# ^ Example CMake configuration line with "install," "build" and "randlibs" folders. # cmake -DCMAKE_BUILD_TYPE=Release -DRandom123_DIR=`pwd`/../../install/random123-install/include/ -Dblaspp_DIR=`pwd`/../../install/blaspp_GPU-install/lib64/blaspp/ -DRandLAPACK_DIR=`pwd`/../../install/RandLAPACK-install/lib64/cmake/ -Dlapackpp_DIR=`pwd`/../../install/lapackpp-install/lib64/lapackpp/ -DCMAKE_BINARY_DIR=`pwd` ../../randlibs/RandLAPACK/benchmark/ # ^ Example CMake configuration line with "install," "build" and "randlibs" folders. @@ -39,7 +42,7 @@ function(add_benchmark) set(MVO CXX_SOURCES LINK_LIBS) cmake_parse_arguments(PARSE_ARGV 0 TGT "${OPTS}" "${NVPO}" "${MVO}") add_executable(${TGT_NAME} ${TGT_CXX_SOURCES}) - target_compile_options(${TGT_NAME} PRIVATE "-g") + target_compile_options(${TGT_NAME} PUBLIC -O1) target_include_directories(${TGT_NAME} PUBLIC ${Benchmark_include_dirs}) target_link_libraries(${TGT_NAME} ${TGT_LINK_LIBS}) message(STATUS "RandLAPACK: added ${TGT_NAME} benchmark") @@ -108,3 +111,17 @@ add_benchmark(NAME ICQRRP_subroutines_speed CXX_SOURCES bench_CQRRP/ICQRRP_ add_benchmark(NAME RBKI_speed_comparisons CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME RBKI_runtime_breakdown CXX_SOURCES bench_RBKI/RBKI_runtime_breakdown.cc LINK_LIBS ${Benchmark_libs}) add_benchmark(NAME RBKI_speed_comparisons_SVDS CXX_SOURCES bench_RBKI/RBKI_speed_comparisons_SVDS.cc LINK_LIBS ${Benchmark_libs_external}) + + + +# KRILL benchmarks +include(FetchContent) +FetchContent_Declare( + fast_matrix_market + GIT_REPOSITORY https://github.com/alugowski/fast_matrix_market + GIT_TAG main + GIT_SHALLOW TRUE +) 
+FetchContent_MakeAvailable(fast_matrix_market) +add_benchmark(NAME KRR_simple CXX_SOURCES bench_kernelalgs/kernelbench_common.hh bench_kernelalgs/krr.cc LINK_LIBS ${Benchmark_libs} fast_matrix_market::fast_matrix_market) +add_benchmark(NAME KPCA_simple CXX_SOURCES bench_kernelalgs/kernelbench_common.hh bench_kernelalgs/kpca.cc LINK_LIBS ${Benchmark_libs} fast_matrix_market::fast_matrix_market) diff --git a/benchmark/bench_kernelalgs/kernelbench_common.hh b/benchmark/bench_kernelalgs/kernelbench_common.hh new file mode 100644 index 00000000..eeaeb1a6 --- /dev/null +++ b/benchmark/bench_kernelalgs/kernelbench_common.hh @@ -0,0 +1,251 @@ +#pragma once + +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include +#include + +#include + +using std_clock = std::chrono::high_resolution_clock; +using timepoint_t = std::chrono::time_point; +using std::chrono::duration_cast; +using std::chrono::microseconds; + +using RandBLAS::RNGState; +using RandLAPACK::rp_cholesky; +using lapack::gesdd; +using lapack::Job; +using std::vector; + +double sec_elapsed(timepoint_t tp0, timepoint_t tp1) { + return ((double) duration_cast(tp1 - tp0).count())/1e6; +} + +template +void transpose_colmajor( + int64_t m, int64_t n, const T* A, int64_t lda, T* AT, int64_t ldat +) { + for(int i = 0; i < n; ++i) + blas::copy(m, &A[i * lda], 1, &AT[i], ldat); +} + + +struct array_matrix { + int64_t nrows = 0, ncols = 0; + std::vector vals; +}; + +struct KRR_data { + array_matrix X_train; + array_matrix Y_train; + array_matrix X_test; + array_matrix Y_test; +}; + +void standardize(KRR_data &krrd) { + randblas_require(krrd.X_train.nrows == krrd.X_test.nrows); + using T = double; + int64_t d = krrd.X_train.nrows; + std::vector mu(d, 0.0); + std::vector sigma(d, 0.0); + RandLAPACK::standardize_dataset( + d, krrd.X_train.ncols, krrd.X_train.vals.data(), mu.data(), sigma.data(), false + ); + RandLAPACK::standardize_dataset( + d, krrd.X_test.ncols, 
krrd.X_test.vals.data(), mu.data(), sigma.data(), true + ); + return; +} + +array_matrix mmread_file(std::string fn, bool transpose = true) { + array_matrix mat{}; + std::ifstream file_stream(fn); + fast_matrix_market::read_matrix_market_array( + file_stream, mat.nrows, mat.ncols, mat.vals, fast_matrix_market::col_major + ); + if (transpose) { + array_matrix tmat{}; + tmat.nrows = mat.ncols; + tmat.ncols = mat.nrows; + tmat.vals.resize(mat.vals.size(), 0.0); + transpose_colmajor( + mat.nrows, mat.ncols, mat.vals.data(), mat.nrows, tmat.vals.data(), tmat.nrows + ); + return tmat; + } else { + return mat; + } +} + +KRR_data mmread_krr_data_dir(std::string dn) { + // mmread_file calls below always apply a transpose; might need to skip transposition for some + // datasets. + KRR_data data{}; + data.X_train = mmread_file(dn + "/Xtr.mm"); + data.Y_train = mmread_file(dn + "/Ytr.mm"); + data.X_test = mmread_file(dn + "/Xts.mm"); + data.Y_test = mmread_file(dn + "/Yts.mm"); + standardize(data); + return data; +} + +namespace memprof { +/* + * Author: David Robert Nadeau + * Site: http://NadeauSoftware.com/ + * License: Creative Commons Attribution 3.0 Unported License + * http://creativecommons.org/licenses/by/3.0/deed.en_US + */ + +#if defined(_WIN32) +#include +#include + +#elif defined(__unix__) || defined(__unix) || defined(unix) || \ + (defined(__APPLE__) && defined(__MACH__)) +#include +#include + +#if defined(__APPLE__) && defined(__MACH__) +#include + +#elif (defined(_AIX) || defined(__TOS__AIX__)) || \ + (defined(__sun__) || defined(__sun) || \ + defined(sun) && (defined(__SVR4) || defined(__svr4__))) +#include +#include + +#elif defined(__linux__) || defined(__linux) || defined(linux) || \ + defined(__gnu_linux__) +#include + +#endif + +#else +#error "Cannot define getPeakRSS( ) or getCurrentRSS( ) for an unknown OS." 
+#endif + +/** + * Returns the peak (maximum so far) resident set size (physical + * memory use) measured in bytes, or zero if the value cannot be + * determined on this OS. + */ +inline size_t getPeakRSS() { +#if defined(_WIN32) + /* Windows -------------------------------------------------- */ + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); + return (size_t)info.PeakWorkingSetSize; + +#elif (defined(_AIX) || defined(__TOS__AIX__)) || \ + (defined(__sun__) || defined(__sun) || \ + defined(sun) && (defined(__SVR4) || defined(__svr4__))) + /* AIX and Solaris ------------------------------------------ */ + struct psinfo psinfo; + int fd = -1; + if ((fd = open("/proc/self/psinfo", O_RDONLY)) == -1) + return (size_t)0L; /* Can't open? */ + if (read(fd, &psinfo, sizeof(psinfo)) != sizeof(psinfo)) { + close(fd); + return (size_t)0L; /* Can't read? */ + } + close(fd); + return (size_t)(psinfo.pr_rssize * 1024L); + +#elif defined(__unix__) || defined(__unix) || defined(unix) || \ + (defined(__APPLE__) && defined(__MACH__)) + /* BSD, Linux, and OSX -------------------------------------- */ + struct rusage rusage; + getrusage(RUSAGE_SELF, &rusage); +#if defined(__APPLE__) && defined(__MACH__) + return (size_t)rusage.ru_maxrss; +#else + return (size_t)(rusage.ru_maxrss * 1024L); +#endif + +#else + /* Unknown OS ----------------------------------------------- */ + return (size_t)0L; /* Unsupported. */ +#endif +} + +/** + * Returns the current resident set size (physical memory use) measured + * in bytes, or zero if the value cannot be determined on this OS. 
+ */ +inline size_t getCurrentRSS() { +#if defined(_WIN32) + /* Windows -------------------------------------------------- */ + PROCESS_MEMORY_COUNTERS info; + GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); + return (size_t)info.WorkingSetSize; + +#elif defined(__APPLE__) && defined(__MACH__) + /* OSX ------------------------------------------------------ */ + struct mach_task_basic_info info; + mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; + if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, + &infoCount) != KERN_SUCCESS) + return (size_t)0L; /* Can't access? */ + return (size_t)info.resident_size; + +#elif defined(__linux__) || defined(__linux) || defined(linux) || \ + defined(__gnu_linux__) + /* Linux ---------------------------------------------------- */ + long rss = 0L; + FILE *fp = NULL; + if ((fp = fopen("/proc/self/statm", "r")) == NULL) + return (size_t)0L; /* Can't open? */ + if (fscanf(fp, "%*s%ld", &rss) != 1) { + fclose(fp); + return (size_t)0L; /* Can't read? */ + } + fclose(fp); + return (size_t)rss * (size_t)sysconf(_SC_PAGESIZE); + +#else + /* AIX, BSD, Solaris, and Unknown OS ------------------------ */ + return (size_t)0L; /* Unsupported. 
*/ +#endif +} + +// inline void log_pages() { +// static size_t pagesize = sysconf(_SC_PAGESIZE); +// int64_t bytes = getCurrentRSS(); +// assert((bytes % pagesize) == 0); +// size_t pages = bytes / pagesize; +// std::cout << "page size: " << pagesize << "\t"; +// std::cout << "bytes: " << bytes << "\t"; +// std::cout << "pages: " << pages << std::endl; +// return; +// } + +inline void log_pages(std::ostream &stream) { + static size_t pagesize = sysconf(_SC_PAGESIZE); + int64_t bytes = getCurrentRSS(); + assert((bytes % pagesize) == 0); + size_t pages = bytes / pagesize; + stream << "page size: " << pagesize << "\t"; + stream << "bytes: " << bytes << "\t"; + stream << "pages: " << pages << std::endl; + return; +} + +inline void log_memory_gb(std::ostream &stream) { + int64_t bytes = getCurrentRSS(); + double gb = ((double) bytes) / ((double) std::pow(1024,3)); + stream << " Memory (GB) : " << gb << "\n"; + return; +} + +} \ No newline at end of file diff --git a/benchmark/bench_kernelalgs/kpca.cc b/benchmark/bench_kernelalgs/kpca.cc new file mode 100644 index 00000000..547db1f3 --- /dev/null +++ b/benchmark/bench_kernelalgs/kpca.cc @@ -0,0 +1,140 @@ + +#include "kernelbench_common.hh" +#include +#include +#include +#include + + +#ifndef DOUT +#define DOUT(_d) std::setprecision(8) << _d +#endif + +using RandLAPACK::rp_cholesky; +using blas::Layout; +using lapack::gesdd; +using lapack::Job; +using std::vector; + + + +template +int cholsvd_square(int64_t m, int64_t n, T* A, int64_t lda, T* singvals_squared, T* work) { + auto layout = Layout::ColMajor; + auto uplo = blas::Uplo::Lower; + blas::syrk(layout, uplo, blas::Op::Trans, n, m, (T)1.0, A, lda, 0.0, work, n); + lapack::syevd(Job::Vec, uplo, n, work, n, singvals_squared); + // The first n*n entries in work hold the right singular vectors of A. + // But they're sorted in the wrong order! 
+ for (int64_t j = 0; j < n/2; ++j) { + auto lead_off = j; + auto trail_off = n-j-1; + T* colj = work + lead_off * n; + T* coljtrail = work + trail_off * n; + for (int64_t i = 0; i < n; ++i) { + std::swap(colj[i], coljtrail[i]); + } + std::swap(singvals_squared[lead_off], singvals_squared[trail_off]); + } + T* trailing_work = work + n*n; + lapack::lacpy(lapack::MatrixType::General, m, n, A, lda, trailing_work, m); + // trailing_work is a packed copy of A (leading dimension m); A itself is read with leading dimension lda. + blas::gemm(layout, blas::Op::NoTrans, blas::Op::NoTrans, m, n, n, (T)1.0, trailing_work, m, work, n, (T)0.0, A, lda); + // invert the scale on each column of A. + for (int64_t i = 0; i < n; ++i) + blas::scal(m, (T) std::pow(singvals_squared[i], -0.5), A + i*lda, 1); + return 0; +} + +enum TSSVD : char { + GESDD = 'G', + CholSVD = 'C', + RandPrecondCholSVD = 'R' +}; + +template +std::pair convert_svd(int64_t m, int64_t rank, vector &U, vector &kevals, TSSVD cs, CALLBACK &cb) { + auto _tp0 = std_clock::now(); + if (cs == TSSVD::GESDD) { + vector work(rank*rank, 0.0); + gesdd(Job::OverwriteVec, m, rank, U.data(), m, kevals.data(), nullptr, 1, work.data(), rank); + for (int64_t i = 0; i < rank; ++i) + kevals[i] = std::pow(kevals[i], 2); + cb(0); + } else if (cs == TSSVD::CholSVD) { + vector work((rank + m)*rank, 0.0); + cholsvd_square(m, rank, U.data(), m, kevals.data(), work.data()); + cb(0); + } + auto _tp1 = std_clock::now(); + return {_tp0, _tp1}; +} + + +int main() { + //std::string dn{"/Users/rjmurr/Documents/open-data/kernel-ridge-regression/sensit_vehicle"}; + std::string dn{"/Users/rjmurr/Documents/open-data/kernel-ridge-regression/cod-rna"}; + auto krrd = mmread_krr_data_dir(dn); + using T = double; + int64_t m = krrd.X_train.ncols; + int64_t d = krrd.X_train.nrows; + std::cout << "\nDataset\n " << dn << std::endl; + std::cout << " cols : " << m << std::endl; + std::cout << " rows : " << d << "\n\n"; + vector mus{0.0}; + RandLAPACK::linops::RBFKernelMatrix K_reg(m, krrd.X_train.vals.data(), d, 3.0, mus); +
K_reg.set_eval_includes_reg(false); + + // Variables for RPCholesky + int64_t rpchol_block_size = 64; + int64_t rank = (int64_t) std::sqrt(m); + vector U(m * rank, 0.0); + RNGState state(0); + vector selection(rank, -1); + + std::stringstream strm{}; + auto callback = [&strm](int64_t i) { memprof::log_memory_gb(strm); return i;}; + + std::cout << "RPCholesky (RPC)\n"; + std::cout << " block size : " << rpchol_block_size << std::endl; + std::cout << " rank limit : " << rank << std::endl; + auto _tp0 = std_clock::now(); + state = rp_cholesky(m, K_reg, rank, selection.data(), U.data(), rpchol_block_size, state, callback); + auto _tp1 = std_clock::now(); + std::cout << " exit rank : " << rank << std::endl; + std::cout << " RPC time (s) : " << DOUT(sec_elapsed(_tp0, _tp1)) << std::endl; + std::cout << strm.str(); + + strm.str(""); + strm.clear(); + + // Variables for SVD conversion + // We don't allocate these earlier, since "rank" might have decreased + // in the call to rp_cholesky. + vector kevals(rank, 0.0); +{ + auto [tp0, tp1] = convert_svd(m, rank, U, kevals, TSSVD::CholSVD, callback); + std::cout << " SVD time (s) : " << DOUT(sec_elapsed(tp0, tp1)) << "\n"; + std::cout << strm.str() << "\n"; +} + // Now check || K_reg @ U[:, 0:num_pc] - U[:,0:num_pc] @ diag(eivals[0:num_pc]) ||, + // or || K_reg @ U[:, 0:num_pc] @ inv(diag(eigvals[0:num_pc])) - U[:,0:num_pc]|| + int64_t num_pc = 5; + vector V(m*num_pc, 0.0); + T onef = 1.0; + K_reg(blas::Layout::ColMajor, num_pc, onef, U.data(), m, (T)0.0, V.data(), m); + for (int64_t i = 0; i < num_pc; ++i) + blas::scal(m, onef/kevals[i], V.data() + i*m, 1); + // ^ Now, V = K_reg @ U[:, 0:num_pc] @ inv(diag(eigvals[0:num_pc])) + vector W(V); + // subtract off U + for (int64_t i = 0; i < m*num_pc; ++i) + W[i] -= U[i]; + // compute column norms of W. 
+ std::cout << "Error in KPCA components " << std::endl; + for (int64_t i = 0; i < num_pc; ++i) { + std::cout << " component " << i << " : " << DOUT(blas::nrm2(m, W.data()+i*m, 1)) << std::endl; + } + std::cout << std::endl; + return 0; +} diff --git a/benchmark/bench_kernelalgs/krr.cc b/benchmark/bench_kernelalgs/krr.cc new file mode 100644 index 00000000..f0e85cfe --- /dev/null +++ b/benchmark/bench_kernelalgs/krr.cc @@ -0,0 +1,74 @@ + +#include "kernelbench_common.hh" + +#ifndef DOUT +#define DOUT(_d) std::setprecision(std::numeric_limits::max_digits10) << _d +#endif + +#ifndef TIMED_LINE +#define TIMED_LINE(_op, _name) { \ + auto _tp0 = std_clock::now(); \ + _op; \ + auto _tp1 = std_clock::now(); \ + auto dtime = sec_elapsed(_tp0, _tp1); \ + std::cout << _name << DOUT(dtime) << std::endl; \ + } +#endif + +int main() { + //std::string dn{"/Users/rjmurr/Documents/open-data/kernel-ridge-regression/sensit_vehicle"}; + std::string dn{"/Users/rjmurr/Documents/open-data/kernel-ridge-regression/cod-rna"}; + auto krrd = mmread_krr_data_dir(dn); + using T = double; + int64_t m = krrd.X_train.ncols; + int64_t d = krrd.X_train.nrows; + std::cout << "cols : " << m << std::endl; + std::cout << "rows : " << d << std::endl; + T mu_min = m * 1e-7; + vector mus{mu_min}; + RandLAPACK::linops::RBFKernelMatrix A_linop(m, krrd.X_train.vals.data(), d, 3.0, mus); + for (int64_t s = 1; s <= 8; s*=2) { + vector H(m*s, 0.0); + + T* Hd = H.data(); + T* hd = krrd.Y_train.vals.data(); + blas::copy(m, hd, 1, Hd, 1); + if (s > 1) { + RNGState state_H(1); + RandBLAS::DenseDist D(m, s - 1, RandBLAS::ScalarDist::Gaussian); + RandBLAS::fill_dense(D, Hd + m, state_H); + T nrm_h = blas::nrm2(m, hd, 1); + for (int i = 1; i < s; ++i) { + // T nrm_Hi = blas::nrm2(m, Hd + i*m, 1); + // T scale = std::pow(2.0*nrm_Hi, -1); + // blas::scal(m, scale, Hd + i*m, 1); + // blas::axpy(m, 1.0, hd, 1, Hd + i*m, 1); + T nrm_Hi = blas::nrm2(m, Hd + i*m, 1); + T scale = nrm_h / nrm_Hi; + blas::scal(m, scale, Hd + 
i*m, 1); + } + } + + vector X(m*s, 0.0); + // solve A_linop X == H + RNGState state(0); + auto seminorm = [](int64_t n, int64_t s, const T* NR){return blas::nrm2(n, NR, 1);}; + int64_t k = 2*1024; + int64_t rpc_b = 64; + int64_t eval_block_size = 1024; + std::cout << "k : " << k << std::endl; + std::cout << "s : " << s << std::endl; + std::cout << "mu0 : " << mu_min << std::endl; + std::cout << "rpc_b : " << rpc_b << std::endl << std::endl; + T tol = std::pow(std::numeric_limits::epsilon(), 0.75); + int64_t max_iters = 25; + A_linop._eval_block_size = eval_block_size; + A_linop._eval_work1.resize(A_linop._eval_block_size * m); + TIMED_LINE( + RandLAPACK::krill_full_rpchol( + m, A_linop, H, X, tol, state, seminorm, rpc_b, max_iters, k + );, "\nKrill : ") + std::cout << std::endl; + } + return 0; +} diff --git a/benchmark/bench_kernelalgs/logging.txt b/benchmark/bench_kernelalgs/logging.txt new file mode 100644 index 00000000..3e95f9ae --- /dev/null +++ b/benchmark/bench_kernelalgs/logging.txt @@ -0,0 +1,173 @@ + +KPCA +==== +Our implementation + Dataset + /Users/rjmurr/Documents/open-data/kernel-ridge-regression/cod-rna + cols : 59535 + rows : 8 + + RPCholesky (RPC) + block size : 64 + rank limit : 243 + exit rank : 243 + RPC time (s) : 0.058426 + SVD time (s) : 0.056271 + + Error in KPCA components + component 0 : 3.165351e-10 + component 1 : 2.4040413e-08 + +Python implementations + + Dataset dimensions (RandLAPACK's convention) + n_rows : 8 + n_cols : 59535 + 44.33819890022278 seconds for sklearn's KPCA. + + + Dataset dimensions (RandLAPACK's convention) + n_rows : 8 + n_cols : 59535 + 0.3426549434661865 seconds for Ethan's RPCholesky, with block size 64. + + + +KRR +=== +CONCLUSIONS + +1. Performance is best with -O1 (13 seconds vs 17 seconds from -O0, but same result as -O0). + +2.
The nature of the preconditioner varies (specifically, where we encounter a Cholesky failure, + and hence the value of the preconditioner's rank) depending on whether we use -O2 or -O3, + *provided* we *don't* have the "-fsanitize=undefined" flag. + +NEXT STEPS + +* Easy: run the larger guy and see how much faster we are with -O1. + +* Medium: export the rp_cholesky preconditioner from MATLAB into a Matrix Market file. + Make a script to run lockorblock_pcg using the SpectralPrecond induced by that matrix. + +//////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////// +//////////////////////////////////////////////////////////// + + +////// Debug (evidently the same as BLANK) ////////// + +(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple +cols : 59535 +rows : 8 +k : 1024 +s : 1 +mu0 : 0.00059535 +rpc_b : 64 + +Cholesky failed with exit code 40. +Returning early, with approximation rank = 956 + +normNR : 113.48 normR : 243.998 k: 0 dim : 0 +normNR : 0.000378058 normR : 0.000443416 k: 1 dim : 1 +normNR : 3.85744e-09 normR : 4.38204e-09 k: 2 dim : 2 +normNR : 3.71858e-14 normR : 4.1411e-14 k: 3 dim : 3 + +Krill : 16.454101000000001 + + +////// Release ////////// + +(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple +cols : 59535 +rows : 8 +k : 1024 +s : 1 +mu0 : 0.00059535 +rpc_b : 64 + +Cholesky failed with exit code 44.
+Returning early, with approximation rank = 319 + +normNR : 113.51 normR : 243.998 k: 0 dim : 0 +normNR : 0.208441 normR : 0.281245 k: 1 dim : 1 +normNR : 0.00179702 normR : 0.00254381 k: 2 dim : 2 +normNR : 1.22709e-05 normR : 1.7396e-05 k: 3 dim : 3 +normNR : 1.25328e-07 normR : 1.79491e-07 k: 4 dim : 4 +normNR : 1.00295e-09 normR : 1.49707e-09 k: 5 dim : 5 +normNR : 1.06377e-11 normR : 1.4495e-11 k: 6 dim : 6 + +Krill : 16.997389999999999 + + +////// RelWithDebInfo ///// DEFAULT, -O2 ////////// + +(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple +cols : 59535 +rows : 8 +k : 1024 +s : 1 +mu0 : 0.00059535 +rpc_b : 64 + +Cholesky failed with exit code 44. +Returning early, with approximation rank = 319 + +normNR : 113.51 normR : 243.998 k: 0 dim : 0 +normNR : 0.208441 normR : 0.281245 k: 1 dim : 1 +normNR : 0.00179702 normR : 0.00254381 k: 2 dim : 2 +normNR : 1.22709e-05 normR : 1.7396e-05 k: 3 dim : 3 +normNR : 1.25328e-07 normR : 1.79491e-07 k: 4 dim : 4 +normNR : 1.00295e-09 normR : 1.49707e-09 k: 5 dim : 5 +normNR : 1.06377e-11 normR : 1.4495e-11 k: 6 dim : 6 + +Krill : 17.326007000000001 + +////// MinRelRelWithDebInfo //// changed to have -O1 /////////// + +(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple +cols : 59535 +rows : 8 +k : 1024 +s : 1 +mu0 : 0.00059535 +rpc_b : 64 + +Cholesky failed with exit code 40. +Returning early, with approximation rank = 956 + +normNR : 113.48 normR : 243.998 k: 0 dim : 0 +normNR : 0.000378058 normR : 0.000443417 k: 1 dim : 1 +normNR : 3.85744e-09 normR : 4.38216e-09 k: 2 dim : 2 +normNR : 3.71859e-14 normR : 4.14125e-14 k: 3 dim : 3 + +Krill : 13.233637999999999 + + + + +////// Release ////////// + +(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple +cols : 59535 +rows : 8 +k : 1024 +s : 1 +mu0 : 0.00059535 +rpc_b : 64 + +Cholesky failed with exit code 40. 
+Returning early, with approximation rank = 956 + +normNR : 113.48 normR : 243.998 k: 0 dim : 0 +normNR : 0.000378058 normR : 0.000443417 k: 1 dim : 1 +normNR : 3.85744e-09 normR : 4.38216e-09 k: 2 dim : 2 +normNR : 3.71859e-14 normR : 4.14125e-14 k: 3 dim : 3 + +Krill : 37.660519000000001 + + + + +I've observed something unexpected. If I apply the compiler flags "-Wall -Wextra -pedantic -fsanitize=undefined" then the behavior of my program with -O3 matches the behavior of my program with -O1. If the program's behavior differs when I use these flags then I would expect the compiler to tell me \ No newline at end of file diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 61169d12..31ac0c5c 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -6,19 +6,24 @@ if (GTest_FOUND) set(tmp TRUE) set(RandLAPACK_test_srcs + moremats.hh comps/test_determiter.cc - comps/test_util.cc comps/test_orth.cc comps/test_qb.cc comps/test_preconditioners.cc comps/test_rf.cc comps/test_syrf.cc + comps/test_rpchol.cc drivers/test_rsvd.cc drivers/test_cqrrpt.cc drivers/test_cqrrp.cc drivers/test_revd2.cc drivers/test_hqrrp.cc drivers/test_rbki.cc + drivers/test_krillx.cc + misc/test_util.cc + misc/test_pdkernels.cc + misc/test_linops.cc ) # Create non-CUDA test executable diff --git a/test/comps/test_determiter.cc b/test/comps/test_determiter.cc index 40d9fb74..89257621 100644 --- a/test/comps/test_determiter.cc +++ b/test/comps/test_determiter.cc @@ -1,13 +1,21 @@ #include "RandLAPACK.hh" #include "rl_blaspp.hh" +#include "../RandLAPACK/RandBLAS/test/comparison.hh" #include #include #include +template +std::vector eye(int64_t n) { + std::vector A(n * n, 0.0); + for (int i = 0; i < n; ++i) + A[i + n*i] = 1.0; + return A; +} + -class TestDetermiterOLS : public ::testing::Test -{ +class TestDetermiterOLS : public ::testing::Test { protected: int64_t m = 201; int64_t n = 12; @@ -17,13 +25,14 @@ class TestDetermiterOLS : public ::testing::Test virtual void TearDown() {}; - 
virtual void run(uint64_t key_index) - { + virtual void run(uint64_t key_index) { + std::vector A(m * n); - RandBLAS::util::genmat(m, n, A.data(), keys[key_index]); + RandBLAS::RNGState state0(keys[key_index]); + auto state1 = RandBLAS::fill_dense({m, n}, A.data(), state0); std::vector b(m); - RandBLAS::util::genmat(m, 1, b.data(), keys[key_index] + (uint64_t) 1); + RandBLAS::fill_dense({m, 1}, b.data(), state1); std::vector c(n, 0.0); std::vector x0(n, 0.0); @@ -39,9 +48,10 @@ class TestDetermiterOLS : public ::testing::Test double delta = 0.1; double tol = 1e-8; - RandLAPACK::pcg( + RandLAPACK::pcg_saddle( m, n, A.data(), m, b.data(), c.data(), delta, - resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data()); + resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data() + ); int64_t iter_count = 0; @@ -64,3 +74,129 @@ TEST_F(TestDetermiterOLS, Trivial) { run(k_idx); } } + + +class TestDetermiterLockBlockPCG : public ::testing::Test { + protected: + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + template + void run_simple_block(int64_t m, int64_t s, T coeff, uint32_t seed) { + using std::vector; + auto layout = blas::Layout::ColMajor; + vector G_buff(m*m, 0.0); + vector H(m*s, 0.0); + randblas_require((int64_t) H.size() == m*s); + vector X_star(m*s, 0.0); + vector X_init(m*s, 0.0); + RandBLAS::RNGState state0(seed); + vector temp(2*m*m); + auto D = RandBLAS::DenseDist {2*m, m, RandBLAS::ScalarDist::Gaussian}; + auto state1 = RandBLAS::fill_dense(D, temp.data(), state0); + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, m, 2*m, 1.0, temp.data(), 2*m, 0.0, G_buff.data(), m); + + vector regs(1, coeff); + RandLAPACK::linops::RegExplicitSymLinOp G(m, G_buff.data(), m, regs); + RandBLAS::DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian}; + auto Xsd = X_star.data(); + auto state2 = RandBLAS::fill_dense(DX_star, Xsd, state1); + G(layout, s, 1.0, X_star.data(), m, 0.0, H.data(), m); + + RandLAPACK::StatefulFrobeniusNorm seminorm{}; + 
+ auto I_buff = eye(m); + vector zeros(1, 0.0); + RandLAPACK::linops::RegExplicitSymLinOp I(m, I_buff.data(), m, zeros); + + T tol = 100*std::numeric_limits::epsilon(); + RandLAPACK::lockorblock_pcg(G, H, tol, m, I, seminorm, X_init, true); + + T tol_scale = std::sqrt((T)m); + T atol = tol_scale * std::pow(std::numeric_limits::epsilon(), 0.5); + T rtol = tol_scale * atol; + test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s, + __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol + ); + return; + } + + virtual void run_simple_lockstep(int64_t m, int64_t s, uint32_t seed) { + using T = double; + randblas_require(s <= 4); + using std::vector; + vector reg_coeffs{}; + reg_coeffs.push_back(100); + if (s > 1) + reg_coeffs.push_back(7); + if (s > 2) + reg_coeffs.push_back(0.1); + if (s > 3) + reg_coeffs.push_back(0.5483); + auto layout = blas::Layout::ColMajor; + vector G_buff(m*m, 0.0); + vector H(m*s, 0.0); + vector X_star(m*s, 0.0); + vector X_init(m*s, 0.0); + RandBLAS::RNGState state0(seed); + vector temp(2*m*m); + + auto D = RandBLAS::DenseDist {2*m, m, RandBLAS::ScalarDist::Gaussian}; + auto state1 = RandBLAS::fill_dense(D, temp.data(), state0); + blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, m, 2*m, 1.0, temp.data(), 2*m, 0.0, G_buff.data(), m); + + vector regs(reg_coeffs); + RandLAPACK::linops::RegExplicitSymLinOp G(m, G_buff.data(), m, regs); + RandBLAS::DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian}; + auto Xsd = X_star.data(); + auto state2 = RandBLAS::fill_dense(DX_star, Xsd, state1); + G(layout, s, 1.0, X_star.data(), m, 0.0, H.data(), m); + + RandLAPACK::StatefulFrobeniusNorm seminorm{}; + + auto I_buff = eye(m); + vector zeros(s, 0.0); + RandLAPACK::linops::RegExplicitSymLinOp I(m, I_buff.data(), m, zeros); + + T tol = 100*std::numeric_limits::epsilon(); + RandLAPACK::lockorblock_pcg(G, H, tol, m, I, seminorm, X_init, true); + + T tol_scale = std::sqrt((T)m); + T atol = tol_scale * 
std::pow(std::numeric_limits::epsilon(), 0.5); + T rtol = tol_scale * atol; + test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s, + __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol + ); + return; + } +}; + + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_5_1) { + run_simple_block(5, 1, 0.5, 1997); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_6_2) { + run_simple_block(6, 2, 0.5, 1997); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_5_4) { + run_simple_block(5, 4, 0.5, 1997); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_5_1) { + run_simple_lockstep(5, 1, 1997); + run_simple_lockstep(5, 1, 2024); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_6_2) { + run_simple_lockstep(6, 2, 1997); + run_simple_lockstep(6, 2, 2024); +} + +TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_5_4) { + run_simple_lockstep(5, 4, 1997); + run_simple_lockstep(5, 4, 2024); +} diff --git a/test/comps/test_orth.cc b/test/comps/test_orth.cc index a31e9690..1034f6f2 100644 --- a/test/comps/test_orth.cc +++ b/test/comps/test_orth.cc @@ -59,7 +59,7 @@ class TestOrth : public ::testing::Test // Fill the gaussian random matrix RandBLAS::DenseDist D(n, k); - state = RandBLAS::fill_dense(D, all_data.Omega.data(), state).second; + state = RandBLAS::fill_dense(D, all_data.Omega.data(), state); // Generate a reference identity RandLAPACK::util::eye(k, k, all_data.I_ref); diff --git a/test/comps/test_pcgls.cc b/test/comps/test_pcgls.cc index 9181d932..ff1e46c7 100644 --- a/test/comps/test_pcgls.cc +++ b/test/comps/test_pcgls.cc @@ -29,7 +29,7 @@ void run_pcgls_ex(int n, int m) double delta = 0.1; double tol = 1e-8; - RandLAPACK::pcg(m, n, A.data(), m, b.data(), c.data(), delta, + RandLAPACK::pcg_saddle(m, n, A.data(), m, b.data(), c.data(), delta, resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data()); for (double res: resid_vec) diff --git a/test/comps/test_preconditioners.cc 
b/test/comps/test_preconditioners.cc index 3b7c3fc6..3b34e6f5 100644 --- a/test/comps/test_preconditioners.cc +++ b/test/comps/test_preconditioners.cc @@ -5,31 +5,43 @@ #include #include +#include "../moremats.hh" +#include "../RandLAPACK/RandBLAS/test/comparison.hh" + + +using std::vector; +using blas::Layout; +using blas::Op; +using RandBLAS::DenseDist; +using RandBLAS::SparseDist; +using RandBLAS::RNGState; +using RandLAPACK_Testing::polynomial_decay_psd; + template void check_condnum_after_precond( - blas::Layout layout, - std::vector &A, - std::vector &M_wk, + Layout layout, + vector &A, + vector &M_wk, int64_t rank, int64_t m, int64_t n ) { - std::vector A_pc(m * rank, 0.0); - bool is_colmajor = layout == blas::Layout::ColMajor; + vector A_pc(m * rank, 0.0); + bool is_colmajor = layout == Layout::ColMajor; int64_t lda = (is_colmajor) ? m : n; int64_t ldm = (is_colmajor) ? n : rank; int64_t ldapc = (is_colmajor) ? m : rank; blas::gemm( layout, - blas::Op::NoTrans, - blas::Op::NoTrans, + Op::NoTrans, + Op::NoTrans, m, rank, n, 1.0, A.data(), lda, M_wk.data(), ldm, 0.0, A_pc.data(), ldapc ); - std::vector s(rank, 0.0); + vector s(rank, 0.0); if (is_colmajor) { lapack::gesvd(lapack::Job::NoVec, lapack::Job::NoVec, m, rank, A_pc.data(), ldapc, s.data(), nullptr, 1, nullptr, 1 @@ -51,7 +63,7 @@ class Test_rpc_svd : public ::testing::Test static inline int64_t m = 500; static inline int64_t n = 10; static inline int64_t d = 30; - static inline std::vector keys = {42, 1}; + static inline vector keys = {42, 1}; static inline double sqrt_cond = 1e5; static inline double mu = 1e-6; // only used in "full_rank_after_reg" test. @@ -72,16 +84,16 @@ class Test_rpc_svd : public ::testing::Test template void test_full_rank_without_reg( int key_index, - blas::Layout layout + Layout layout ){ // construct "A" with cond(A) >= sqrt_cond^2. 
- std::vector A(m*n, 0.0); + vector A(m*n, 0.0); T *a = A.data(); - RandBLAS::DenseDist D(m, n, RandBLAS::DenseDistName::Uniform); - auto state = RandBLAS::RNGState(99); + DenseDist D(m, n, RandBLAS::ScalarDist::Uniform); + auto state = RNGState(99); RandBLAS::fill_dense(D, a, state); - if (layout == blas::Layout::RowMajor) { + if (layout == Layout::RowMajor) { // scale first row up by sqrt_cond // scale second row down by sqrt_cond blas::scal(n, sqrt_cond, a, 1); @@ -96,11 +108,11 @@ class Test_rpc_svd : public ::testing::Test } // apply the function under test (rpc_data_svd_saso) - auto alg_state = RandBLAS::RNGState((uint32_t) keys[key_index]); - std::vector M_wk(d*n, 0.0); - std::vector sigma_sk(n, 0.0); - int64_t lda = (layout == blas::Layout::ColMajor) ? m : n; - RandBLAS::SparseDist SDist{.n_rows=d, .n_cols=m, .vec_nnz=8}; + auto alg_state = RNGState((uint32_t) keys[key_index]); + vector M_wk(d*n, 0.0); + vector sigma_sk(n, 0.0); + int64_t lda = (layout == Layout::ColMajor) ? m : n; + SparseDist SDist(d, m, 8, RandBLAS::Axis::Short); RandBLAS::SparseSkOp S(SDist, alg_state); RandBLAS::fill_sparse(S); @@ -132,10 +144,10 @@ class Test_rpc_svd : public ::testing::Test ){ // construct an ill-conditioned matrix, then zero out first column. // After regularization the augmented matrix will still be full-rank. 
- std::vector A(m*n, 0.0); + vector A(m*n, 0.0); double *a = A.data(); - RandBLAS::DenseDist D(m, n, RandBLAS::DenseDistName::Uniform); - auto state = RandBLAS::RNGState(99); + DenseDist D(m, n, RandBLAS::ScalarDist::Uniform); + auto state = RNGState(99); RandBLAS::fill_dense(D, a, state); blas::scal(n, sqrt_cond, a, 1); @@ -144,20 +156,20 @@ class Test_rpc_svd : public ::testing::Test blas::scal(m, 0.0, a, n); // apply the function under test (rpc_svd_saso) - std::vector M_wk(d*n, 0.0); - std::vector sigma_sk(n, 0.0); - auto alg_state = RandBLAS::RNGState(keys[key_index]); + vector M_wk(d*n, 0.0); + vector sigma_sk(n, 0.0); + auto alg_state = RNGState(keys[key_index]); RandLAPACK::rpc_data_svd_saso( - blas::Layout::RowMajor, m, n, d, 8, + Layout::RowMajor, m, n, d, 8, A.data(), n, M_wk.data(), sigma_sk.data(), alg_state ); int64_t rank = RandLAPACK::make_right_orthogonalizer( - blas::Layout::RowMajor, + Layout::RowMajor, n, M_wk.data(), sigma_sk.data(), mu ); EXPECT_EQ(rank, n); - std::vector A_aug((m + n)*n, 0.0); + vector A_aug((m + n)*n, 0.0); double *a_aug = A_aug.data(); blas::copy(m*n, a, 1, a_aug, 1); double sqrt_mu = std::sqrt(mu); @@ -165,20 +177,20 @@ class Test_rpc_svd : public ::testing::Test for (int i = 0; i < n; ++i) sqrt_mu_eye[n*i + i] = sqrt_mu; - check_condnum_after_precond(blas::Layout::RowMajor, A_aug, M_wk, rank, m + n, n); + check_condnum_after_precond(Layout::RowMajor, A_aug, M_wk, rank, m + n, n); } }; TEST_F(Test_rpc_svd, FullRankNoReg_rowmajor_double) { - test_full_rank_without_reg(0, blas::Layout::RowMajor); - test_full_rank_without_reg(1, blas::Layout::RowMajor); + test_full_rank_without_reg(0, Layout::RowMajor); + test_full_rank_without_reg(1, Layout::RowMajor); } TEST_F(Test_rpc_svd, FullRankNoReg_colmajor_double) { - test_full_rank_without_reg(0, blas::Layout::ColMajor); - test_full_rank_without_reg(1, blas::Layout::ColMajor); + test_full_rank_without_reg(0, Layout::ColMajor); + test_full_rank_without_reg(1, Layout::ColMajor); } 
TEST_F(Test_rpc_svd, FullRankAfterReg) @@ -187,51 +199,27 @@ TEST_F(Test_rpc_svd, FullRankAfterReg) test_full_rank_after_reg(1); } -class TestNystromPrecond : public ::testing::Test -{ + +/*** + * This actually assesses quality of the Nystrom preconditioner. + */ +class TestNystromPrecond : public ::testing::Test { protected: static inline int64_t m = 500; - static inline std::vector keys = {42, 1}; + static inline vector keys = {42, 1}; virtual void SetUp() {}; virtual void TearDown() {}; template - void set_invP(int64_t m, int64_t k, T* V, T* lambda, T mu, T* invP) { - // compute invP = V * diag((min(lambda) + mu)/(lambda + mu)) * V' + (I - VV'). - RandLAPACK::util::eye(m, m, invP); - blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans, - m, m, k, -1.0, V, m, V, m, 1.0, invP, m - ); - for (int i = 0; i < k; ++i) { - blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans, - m, m, 1, (lambda[k-1] + mu) / (lambda[i] + mu), - &V[i*m], m, &V[i*m], m, 1.0, invP, m - ); - } - }; - - template - void set_G_mu_pre(int64_t m, T* G, T mu, T* invP, T* G_mu_pre) { - // G_mu_pre = (G + mu)*invP - blas::copy(m * m, invP, 1, G_mu_pre, 1); - blas::scal(m * m, mu, G_mu_pre, 1); - blas::symm(blas::Layout::ColMajor, blas::Side::Left, blas::Uplo::Lower, - m, m, 1.0, G, m, invP, m, 1.0, G_mu_pre, m - ); - for(int i = 1; i < m; ++i) - blas::copy(m - i, &G_mu_pre[i + ((i-1) * m)], 1, &G_mu_pre[(i - 1) + (i * m)], m); - }; - - template - void run(int key_index, std::vector &G) { + void run(int key_index, vector &G) { /* Run the algorithm under test */ - RandBLAS::RNGState alg_state(keys[key_index]); + RNGState alg_state(keys[key_index]); alg_state.key.incr(); - std::vector V(0); - std::vector lambda(0); + vector V(0); + vector lambda(0); int64_t k = 1; T mu_min = 1e-5; RandLAPACK::nystrom_pc_data( @@ -239,49 +227,53 @@ class TestNystromPrecond : public ::testing::Test ); // k has been updated. 
/* Verify algorithm output */ - EXPECT_TRUE(k > 5); + EXPECT_TRUE(k > 2); EXPECT_TRUE(k < m); - std::vector invP(m * m, 0.0); - std::vector G_mu_pre(m * m, 0.0); + RandLAPACK::linops::SpectralPrecond invP(m); + vector G_mu_pre(m * m, 0.0); + vector G_mu(m * m); + vector mus(1); + vector s(m); + mus[0] = mu_min; + G_mu = G; + for (int64_t i = 0; i < m; ++i) + G_mu[i + i*m] += mus[0]; + invP.prep(V, lambda, mus, m); + invP.evaluate(m, G_mu.data(), G_mu_pre.data()); T cond_lim = 5; - T mu = mu_min; - std::vector s(m); - set_invP(m, k, V.data(), lambda.data(), mu, invP.data()); - set_G_mu_pre(m, G.data(), mu, invP.data(), G_mu_pre.data()); lapack::gesvd(lapack::Job::NoVec, lapack::Job::NoVec, m, m, G_mu_pre.data(), m, s.data(), nullptr, 1, nullptr, 1); T cond = s[0] / s[m-1]; EXPECT_LE(cond, cond_lim); - mu *= 10; + mus[0] *= 10; + G_mu = G; + for (int64_t i = 0; i < m; ++i) + G_mu[i + i*m] += mus[0]; + invP.prep(V, lambda, mus, m); + invP.evaluate(m, G_mu.data(), G_mu_pre.data()); cond_lim /= 2; - set_invP(m, k, V.data(), lambda.data(), mu, invP.data()); - set_G_mu_pre(m, G.data(), mu, invP.data(), G_mu_pre.data()); lapack::gesvd(lapack::Job::NoVec, lapack::Job::NoVec, m, m, G_mu_pre.data(), m, s.data(), nullptr, 1, nullptr, 1); cond = s[0] / s[m-1]; EXPECT_LE(cond, cond_lim); - mu *= 10; + mus[0] *= 10; + G_mu = G; + for (int64_t i = 0; i < m; ++i) + G_mu[i + i*m] += mus[0]; + invP.prep(V, lambda, mus, m); + invP.evaluate(m, G_mu.data(), G_mu_pre.data()); cond_lim /= 2; - set_invP(m, k, V.data(), lambda.data(), mu, invP.data()); - set_G_mu_pre(m, G.data(), mu, invP.data(), G_mu_pre.data()); lapack::gesvd(lapack::Job::NoVec, lapack::Job::NoVec, m, m, G_mu_pre.data(), m, s.data(), nullptr, 1, nullptr, 1); cond = s[0] / s[m-1]; EXPECT_LE(cond, cond_lim); - }; + } }; + TEST_F(TestNystromPrecond, basictest) { - RandLAPACK::gen::mat_gen_info mat_info(m, m, RandLAPACK::gen::polynomial); - mat_info.cond_num = 1e6; - mat_info.rank = m; - mat_info.exponent = 2.0; - std::vector 
A(m * m, 0.0); - RandBLAS::RNGState data_state(0); - RandLAPACK::gen::mat_gen(mat_info, A.data(), data_state); - std::vector G(m * m, 0.0); - blas::syrk(Layout::ColMajor, Uplo::Lower, Op::NoTrans, m, m, 1.0, - A.data(), m, 0.0, G.data(), m - ); // Note: G is PSD with squared spectrum of A. - run(0, G); + auto G = polynomial_decay_psd(m, 1e12, 2.0, 99); + run(0, G); + run(1, G); } + diff --git a/test/comps/test_rpchol.cc b/test/comps/test_rpchol.cc new file mode 100644 index 00000000..03c281d2 --- /dev/null +++ b/test/comps/test_rpchol.cc @@ -0,0 +1,173 @@ +#include "RandLAPACK.hh" +#include "rl_rpchol.hh" +#include "rl_blaspp.hh" +#include "rl_gen.hh" +#include "../RandLAPACK/RandBLAS/test/comparison.hh" + +#include +#include +#include + +// template +// std::vector eye(int64_t n) { +// std::vector A(n * n, 0.0); +// for (int i = 0; i < n; ++i) +// A[i + n*i] = 1.0; +// return A; +// } + +using RandBLAS::RNGState; + +template +RNGState left_multiply_by_orthmat(int64_t m, int64_t n, std::vector &A, RNGState state) { + using std::vector; + vector U(m * m, 0.0); + RandBLAS::DenseDist DU(m, m); + auto out_state = RandBLAS::fill_dense(DU, U.data(), state); + vector tau(m, 0.0); + lapack::geqrf(m, m, U.data(), m, tau.data()); + lapack::ormqr(blas::Side::Left, blas::Op::NoTrans, m, n, m, U.data(), m, tau.data(), A.data(), m); + return out_state; +} + +template +void full_gram(int64_t n, std::vector &A, blas::Op op, int64_t k = -1) { + std::vector work(A); + auto uplo = blas::Uplo::Upper; + auto layout = blas::Layout::ColMajor; + if (k == -1) { + k = n; + } else { + randblas_require(op == blas::Op::NoTrans); + } + blas::syrk(layout, uplo, op, n, k, 1.0, work.data(), n, 0.0, A.data(), n); + RandBLAS::symmetrize(layout, uplo, n, A.data(), n); +} + +class TestRPCholesky : public ::testing::Test { + protected: + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + template + void run_exact(int64_t n, FUNC &A, T* Abuff, int64_t b, T atol, T rtol, uint32_t seed) { + 
using std::vector; + + int64_t k = n; + vector F(n*k, 0.0); + vector selection(k, -1); + RandBLAS::RNGState state_in(seed); + auto state_out = RandLAPACK::rp_cholesky(n, A, k, selection.data(), F.data(), b, state_in); + + vector Arecovered(F); + full_gram(n, Arecovered, blas::Op::NoTrans, k); + test::comparison::matrices_approx_equal( + blas::Layout::ColMajor, blas::Op::NoTrans, n, n, Abuff, n, Arecovered.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__, + atol, rtol + ); + // Check that the pivots are reasonable and nontrivial (i.e., not the sequence from 0 to n-1). + std::set selection_unique{}; + for (auto pivot : selection) { + if (pivot != -1) + selection_unique.insert(pivot); + } + ASSERT_EQ(selection_unique.size(), k) << "using seed " << seed; + if (n > 4) + ASSERT_FALSE(std::is_sorted(selection.begin(), selection.end())) << "using seed " << seed; + // ^ is_sorted() checks if we're in increasing order + return; + } + + template + void run_exact_diag(int64_t n, int64_t b, int64_t power, uint32_t seed) { + std::vector Avec(n * n, 0.0); + for (int64_t i = 0; i < n; ++i) + Avec[i + n*i] = std::pow((T) i + 1, power); + auto Abuff = Avec.data(); + auto A = [Abuff, n](int64_t i, int64_t j) { return Abuff[i + n*j]; }; + + T atol = std::sqrt(n) * std::numeric_limits::epsilon(); + T rtol = std::sqrt(n) * std::numeric_limits::epsilon(); + run_exact(n, A, Abuff, b, atol, rtol, seed); + return; + } + + template + void run_exact_kahan_gram(int64_t n, int64_t b, uint32_t seed) { + using std::vector; + vector Avec(n * n, 0.0); + T theta = 1.2; + T perturb = 10; + RandLAPACK::gen::gen_kahan_mat(n, n, Avec.data(), theta, perturb); + vector kahan(Avec); + full_gram(n, Avec, blas::Op::Trans); + // ^ Avec now represents the Gram matrix of the Kahan matrix. + + std::vector gk_chol(Avec); + // ^ We'll run Cholesky on the Gram matrix of the Kahan matrix, + // and compare to the Kahan matrix itself. 
This helps us get + // a realistic tolerance considering the numerical nastyness + // of the Kahan matrix. + auto status = lapack::potrf(blas::Uplo::Upper, n, gk_chol.data(), n); + randblas_require(status == 0); + T atol = 0.0; + RandLAPACK::util::get_U(n, n, gk_chol.data(), n); + for (int64_t i = 0; i < n*n; ++i) { + T val1 = std::abs(kahan[i] - gk_chol[i]); + T val2 = std::abs(kahan[i] + gk_chol[i]); + atol = std::max(atol, std::min(val1, val2)); + } + atol = std::sqrt(n) * atol; + + T* Abuff = Avec.data(); + auto A = [Abuff, n](int64_t i, int64_t j) { return Abuff[i + n*j]; }; + run_exact(n, A, Abuff, b, atol, atol, seed); + // ^ use the same value for rtol and atol + return; + } +}; + + +TEST_F(TestRPCholesky, test_exact_diag_b1) { + for (uint32_t i = 2012; i < 2019; ++i) { + run_exact_diag(5, 1, 2, i); + run_exact_diag(10, 1, 1, i); + run_exact_diag(10, 1, 2, i); + run_exact_diag(13, 1, 2, i); + run_exact_diag(100, 1, 2, i); + } +} + +TEST_F(TestRPCholesky, test_exact_diag_b2) { + for (uint32_t i = 2012; i < 2019; ++i) { + run_exact_diag(10, 2, 1, i); + run_exact_diag(10, 2, 2, i); + run_exact_diag(100, 2, 2, i); + } +} + +TEST_F(TestRPCholesky, test_exact_kahan_gram_b1) { + for (uint32_t i = 2012; i < 2019; ++i) { + run_exact_kahan_gram(5, 1, i); + run_exact_kahan_gram(10, 1, i); + } +} + +TEST_F(TestRPCholesky, test_exact_kahan_gram_b2) { + for (uint32_t i = 2012; i < 2019; ++i) { + run_exact_kahan_gram(10, 2, i); + run_exact_kahan_gram(11, 2, i); + run_exact_kahan_gram(12, 2, i); + } +} + +TEST_F(TestRPCholesky, test_exact_kahan_gram_b3) { + for (uint32_t i = 2012; i < 2019; ++i) { + run_exact_kahan_gram(9, 3, i); + run_exact_kahan_gram(10, 3, i); + run_exact_kahan_gram(11, 3, i); + run_exact_kahan_gram(12, 3, i); + } +} diff --git a/test/drivers/test_cqrrp.cc b/test/drivers/test_cqrrp.cc index e724f600..f10b24e2 100644 --- a/test/drivers/test_cqrrp.cc +++ b/test/drivers/test_cqrrp.cc @@ -1,3 +1,4 @@ +#if !defined(__APPLE__) #include "RandLAPACK.hh" 
#include "rl_blaspp.hh" #include "rl_lapackpp.hh" @@ -146,7 +147,6 @@ class TestCQRRP : public ::testing::Test }; -#if !defined(__APPLE__) // Note: If Subprocess killed exception -> reload vscode TEST_F(TestCQRRP, CQRRP_blocked_full_rank_basic) { int64_t m = 5000;//5000; diff --git a/test/drivers/test_hqrrp.cc b/test/drivers/test_hqrrp.cc index ffa09f2b..5ade37e0 100644 --- a/test/drivers/test_hqrrp.cc +++ b/test/drivers/test_hqrrp.cc @@ -7,7 +7,6 @@ #include #include - class TestHQRRP : public ::testing::Test { protected: diff --git a/test/drivers/test_krillx.cc b/test/drivers/test_krillx.cc new file mode 100644 index 00000000..77247c75 --- /dev/null +++ b/test/drivers/test_krillx.cc @@ -0,0 +1,189 @@ +#include +#include +#include +#include +#include +#include + +#include "../moremats.hh" +#include "../RandLAPACK/RandBLAS/test/comparison.hh" + + +using std::vector; +using blas::Layout; +using blas::Op; +using RandBLAS::DenseDist; +using RandBLAS::SparseDist; +using RandBLAS::RNGState; +using RandLAPACK::linops::RegExplicitSymLinOp; +using RandLAPACK::linops::RBFKernelMatrix; +using RandLAPACK_Testing::polynomial_decay_psd; + + +class TestKrillIsh: public ::testing::Test { + + protected: + static inline int64_t m = 1000; + static inline vector keys = {42, 1}; + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + template + void run_common(T mu_min, vector &V, vector &lambda, RegExplicitSymLinOp &G_linop) { + RandLAPACK::linops::SpectralPrecond invP(m); + vector mus {mu_min, mu_min/10, mu_min/100}; + G_linop.regs = mus; + G_linop.set_eval_includes_reg(true); + invP.prep(V, lambda, mus, mus.size()); + int64_t s = mus.size(); + + vector X_star(m*s, 0.0); + vector X_init(m*s, 0.0); + vector H(m*s, 0.0); + RNGState state0(101); + DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian}; + auto Xsd = X_star.data(); + auto state1 = RandBLAS::fill_dense(DX_star, Xsd, state0); + G_linop(blas::Layout::ColMajor, s, 1.0, X_star.data(), m, 0.0, H.data(), m); + + 
std::cout << "\nFrobenius norm of optimal solution : " << blas::nrm2(m*s, X_star.data(), 1); + std::cout << "\nFrobenius norm of right-hand-side : " << blas::nrm2(m*s, H.data(), 1) << std::endl; + RandLAPACK::StatefulFrobeniusNorm seminorm{}; + T tol = 100*std::numeric_limits::epsilon(); + int64_t max_iters = 30; + RandLAPACK::lockorblock_pcg(G_linop, H, tol, max_iters, invP, seminorm, X_init, true); + + T tol_scale = std::sqrt((T)m); + T atol = tol_scale * std::pow(std::numeric_limits::epsilon(), 0.5); + T rtol = tol_scale * atol; + test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s, + __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol + ); + return; + } + + template + void run_nystrom(int key_index, vector &G) { + /* Run the algorithm under test */ + RNGState alg_state(keys[key_index]); + alg_state.key.incr(); + vector V(0); + vector lambda(0); + int64_t k = 64; + T mu_min = 1e-5; + vector regs{}; + RegExplicitSymLinOp G_linop(m, G.data(), m, regs); + RandLAPACK::nystrom_pc_data( + G_linop, V, lambda, k, mu_min/10, alg_state + ); // k has been updated. 
+ EXPECT_TRUE(k > 5); + EXPECT_TRUE(k < m); + run_common(mu_min, V, lambda, G_linop); + } + + template + void run_rpchol(int key_index, vector &G) { + RNGState alg_state(keys[key_index]); + alg_state.key.incr(); + int64_t k = 128; + vector V(m*k); + vector lambda(k); + T mu_min = 1e-5; + int64_t rp_chol_block_size = 4; + vector regs{}; + RegExplicitSymLinOp G_linop(m, G.data(), m, regs); + RandLAPACK::rpchol_pc_data(m, G_linop, k, rp_chol_block_size, V.data(), lambda.data(), alg_state); + EXPECT_TRUE(k == 128); + run_common(mu_min, V, lambda, G_linop); + } +}; + +TEST_F(TestKrillIsh, test_manual_lockstep_nystrom) { + for (int64_t decay = 2; decay < 4; ++decay) { + auto G = polynomial_decay_psd(m, 1e12, (double) decay, 99); + run_nystrom(0, G); + run_nystrom(1, G); + } +} + +TEST_F(TestKrillIsh, test_manual_lockstep_rpchol) { + auto G = polynomial_decay_psd(m, 1e12, 2.0, 99); + run_rpchol(0, G); + run_rpchol(1, G); +} + + +class TestKrillx: public ::testing::Test { + + protected: + static inline int64_t m = 1000; + static inline vector keys = {42, 1}; + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + template + void run_krill_separable(int key_index, RELO &G_linop, int64_t k) { + using T = typename RELO::scalar_t; + int64_t s = G_linop.regs.size(); + + vector X_star(m*s, 0.0); + vector X_init(m*s, 0.0); + vector H(m*s, 0.0); + RNGState state0(101); + DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian}; + auto Xsd = X_star.data(); + auto state1 = RandBLAS::fill_dense(DX_star, Xsd, state0); + G_linop.set_eval_includes_reg(true); + G_linop(blas::Layout::ColMajor, s, 1.0, X_star.data(), m, 0.0, H.data(), m); + std::cout << "\nFrobenius norm of optimal solution : " << blas::nrm2(m*s, X_star.data(), 1); + std::cout << "\nFrobenius norm of right-hand-side : " << blas::nrm2(m*s, H.data(), 1) << std::endl; + + RandLAPACK::StatefulFrobeniusNorm seminorm{}; + T tol = 100*std::numeric_limits::epsilon(); + int64_t max_iters = 30; + int64_t rpc_blocksize = 16; 
+ RNGState state2(keys[key_index]); + RandLAPACK::krill_full_rpchol( + m, G_linop, H, X_init, tol, state2, seminorm, rpc_blocksize, max_iters, k + ); + T tol_scale = std::sqrt((T)m); + T atol = tol_scale * std::pow(std::numeric_limits::epsilon(), 0.5); + T rtol = tol_scale * atol; + test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s, + __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol + ); + return; + } +}; + +TEST_F(TestKrillx, test_krill_full_rpchol) { + using T = double; + T mu_min = 1e-5; + vector mus {mu_min, mu_min/10, mu_min/100}; + for (int64_t decay = 2; decay < 4; ++decay) { + auto G = polynomial_decay_psd(m, 1e12, (T) decay, 99); + RegExplicitSymLinOp G_linop(m, G.data(), m, mus); + int64_t k = 128; + run_krill_separable(0, G_linop, k); + run_krill_separable(1, G_linop, k); + } +} + +TEST_F(TestKrillx, test_krill_separable_squared_exp_kernel) { + using T = double; + T mu_min = 1e-2; + vector mus {mu_min, mu_min*10, mu_min*100}; + for (uint32_t key = 0; key < 5; ++key) { + //auto G = polynomial_decay_psd(m, 1e12, (T) decay, key); + //RegExplicitSymLinOp G_linop(m, G.data(), m, mus); + vector X0 = RandLAPACK_Testing::random_gaussian_mat(5, m, key); + RBFKernelMatrix G_linop(m, X0.data(), 5, 3.0, mus); + int64_t k = 128; + run_krill_separable(0, G_linop, k); + run_krill_separable(1, G_linop, k); + } +} diff --git a/test/misc/test_linops.cc b/test/misc/test_linops.cc new file mode 100644 index 00000000..af86ab85 --- /dev/null +++ b/test/misc/test_linops.cc @@ -0,0 +1,112 @@ +#include +#include +#include +#include +#include +#include +#include "../RandLAPACK/RandBLAS/test/comparison.hh" + + +using std::vector; +using blas::Layout; +using blas::Op; +using RandBLAS::DenseDist; +using RandBLAS::SparseDist; +using RandBLAS::RNGState; + + +/** + * Note: a few implicit linear operators are tested implicitly (ha) in + * test_determiter.cc. 
It's important to have tests for these things + * since bugs in their implementation can be hard to track down. + */ + + +class TestSpectralPrecondLinearOperator: public ::testing::Test { + + protected: + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + // Run on a diagonal matrix with an optimal rank-k preconditioner. + template + void run_diag(int64_t n, int64_t k, T mu) { + int64_t i; + vector alleigs(n); + vector allV(n*n, 0.0); + for (i = 0; i < n; ++i) { + alleigs[i] = std::pow((T)i + (T)1.0, (T) -3.0); + allV[i + i*n] = 1.0; + } + + vector G_mu(n*n, 0.0); + for (i = 0; i < n; ++i) { + G_mu[i + i*n] = alleigs[i] + mu; + } + + vector pceigs(k); + vector pcV(n*k, 0.0); + for (i = 0; i < k; ++i) { + pceigs[i] = alleigs[i]; + pcV[i + i*n] = 1.0; + } + vector G_mu_pre_expect(n*n, 0.0); + T scale_on_precond_subspace = alleigs[k-1] + mu; + for (i = 0; i < n; ++i) { + if (i < k) { + G_mu_pre_expect[i + i*n] = scale_on_precond_subspace; + } else { + G_mu_pre_expect[i + i*n] = alleigs[i] + mu; + } + } + RandLAPACK::linops::SpectralPrecond invP_operator(n); + vector mus(1, mu); + invP_operator.prep(pcV, pceigs, mus, n); + vector G_mu_pre_actual(n*n, 0.0); + invP_operator.evaluate(n, G_mu.data(), G_mu_pre_actual.data()); + test::comparison::matrices_approx_equal( + Layout::ColMajor, Op::NoTrans, n, n, G_mu_pre_actual.data(), n, + G_mu_pre_expect.data(), n, __PRETTY_FUNCTION__, + __FILE__, __LINE__ + ); + return; + } +}; + +TEST_F(TestSpectralPrecondLinearOperator, test_diag_n3_k1) { + run_diag(3, 1, 0.1); +} + +TEST_F(TestSpectralPrecondLinearOperator, test_diag_n3_k2) { + run_diag(3, 2, 0.1); +} + +TEST_F(TestSpectralPrecondLinearOperator, test_diag_n4_k1) { + run_diag(4, 1, 0.1); +} + +TEST_F(TestSpectralPrecondLinearOperator, test_diag_n4_k2) { + run_diag(4, 2, 0.1); +} + +TEST_F(TestSpectralPrecondLinearOperator, test_diag_n4_k3) { + run_diag(4, 3, 0.1); +} + +TEST_F(TestSpectralPrecondLinearOperator, test_diag_n5_k1) { + run_diag(5, 1, 0.1); +} + 
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n5_k2) { + run_diag(5, 2, 0.1); +} + +TEST_F(TestSpectralPrecondLinearOperator, test_diag_n5_k3) { + run_diag(5, 3, 0.1); +} + +TEST_F(TestSpectralPrecondLinearOperator, test_diag_n5_k4) { + run_diag(5, 4, 0.1); +} diff --git a/test/misc/test_pdkernels.cc b/test/misc/test_pdkernels.cc new file mode 100644 index 00000000..24b02f46 --- /dev/null +++ b/test/misc/test_pdkernels.cc @@ -0,0 +1,268 @@ +#include "RandLAPACK.hh" +#include "rl_blaspp.hh" +#include "rl_gen.hh" + +#include +#include "../RandLAPACK/RandBLAS/test/comparison.hh" +#include "../moremats.hh" + +#include +#include + +using RandBLAS::RNGState; +using RandBLAS::DenseDist; +using blas::Layout; +using std::vector; + +class TestPDK_SquaredExponential : public ::testing::Test { + protected: + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + /** + * Test that squared_exp_kernel_submatrix gives the same result + * as calls to squared_exp_kernel. + */ + template + void run_same_blockimpl_vs_entrywise(int64_t d, int64_t n, T bandwidth, uint32_t seed) { + vector K_blockimpl(n*n, 0.0); + vector K_entrywise(n*n, 0.0); + vector X = RandLAPACK_Testing::random_gaussian_mat(d, n, seed); + vector squared_norms(n, 0.0); + T* X_ = X.data(); + for (int64_t i = 0; i < n; ++i) { + squared_norms[i] = std::pow(blas::nrm2(d, X_ + i*d, 1), 2); + } + RandLAPACK::squared_exp_kernel_submatrix( + d, n, X_, squared_norms.data(), n, n, K_blockimpl.data(), 0, 0, bandwidth + ); + for (int64_t j = 0; j < n; ++j) { + for (int64_t i = 0; i < n; ++i) { + T* xi = X.data() + i*d; + T* xj = X.data() + j*d; + K_entrywise[i + j*n] = RandLAPACK::squared_exp_kernel(d, xi, xj, bandwidth); + } + } + T atol = 3 * d * std::numeric_limits::epsilon() * (1.0 + std::pow(bandwidth, -2)); + test::comparison::matrices_approx_equal( + blas::Layout::ColMajor, blas::Op::NoTrans, n, n, K_blockimpl.data(), n, + K_entrywise.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, atol + ); + 
return; + } + + /** + * Test that if all of X's columns are the same then the squared exponential kernel + * gives a matrix of all ones. + */ + template + void run_all_same_column(int64_t d, int64_t n, uint32_t seed) { + vector c = RandLAPACK_Testing::random_gaussian_mat(d, 1, seed); + vector X(d*n, 0.0); + T* _X = X.data(); + T* _c = c.data(); + for (int64_t i = 0; i < n; ++i) { + blas::copy(d, _c, 1, _X + i*d, 1); + } + T sqnorm = std::pow(blas::nrm2(d, _c, 1), 2); + vector squarednorms(n, sqnorm); + vector K(n*n, 0.0); + T bandwidth = 2.3456; + RandLAPACK::squared_exp_kernel_submatrix( + d, n, _X, squarednorms.data(), n, n, K.data(), 0, 0, bandwidth + ); + vector expected(n*n, 1.0); + test::comparison::matrices_approx_equal( + blas::Layout::ColMajor, blas::Op::NoTrans, n, n, K.data(), n, + expected.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__ + ); + return; + } + + /** + * Test that if the columns of X are orthonormal then the diagonal + * will be all ones and the off-diagonal will be exp(-bandwidth^{-2}); + * this needs to vary with different values for the bandwidth. 
+ */ + template + void run_orthogonal(int64_t n, T bandwidth, uint32_t seed) { + std::vector X(n*n, 0.0); + for (int64_t i = 0; i < n; ++i) + X[i+i*n] = 1.0; + RNGState state(seed); + RandLAPACK_Testing::left_multiply_by_orthmat(n, n, X, state); + vector squarednorms(n, 1.0); + vector K(n*n, 0.0); + RandLAPACK::squared_exp_kernel_submatrix( + n, n, X.data(), squarednorms.data(), n, n, K.data(), 0, 0, bandwidth + ); + T offdiag = std::exp(-std::pow(bandwidth, -2)); + std::vector expect(n*n); + for (int64_t j = 0; j < n; ++j) { + for (int64_t i = 0; i < n; ++i) { + if (i == j) { + expect[i+j*n] = 1.0; + } else { + expect[i+j*n] = offdiag; + } + } + } + T atol = 50 * std::numeric_limits::epsilon(); + test::comparison::matrices_approx_equal( + blas::Layout::ColMajor, blas::Op::NoTrans, n, n, K.data(), n, + expect.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, atol + ); + return; + } + +}; + +TEST_F(TestPDK_SquaredExponential, test_repeated_columns) { + for (uint32_t i = 10; i < 15; ++i) { + run_all_same_column(3, 9, i); + run_all_same_column(9, 3, i); + } +} + + +TEST_F(TestPDK_SquaredExponential, test_blockimpl_vs_entrywise_full_matrix_d_3_n_10) { + for (uint32_t i = 2; i < 7; ++i) { + run_same_blockimpl_vs_entrywise(3, 10, 1.0, i); + run_same_blockimpl_vs_entrywise(3, 10, 0.2, i); + run_same_blockimpl_vs_entrywise(3, 10, 5.9, i); + } +} + +TEST_F(TestPDK_SquaredExponential, test_blockimpl_vs_entrywise_full_matrix_d_10_n_3) { + for (uint32_t i = 2; i < 7; ++i) { + run_same_blockimpl_vs_entrywise(10, 3, 1.0, i); + run_same_blockimpl_vs_entrywise(10, 3, 0.2, i); + run_same_blockimpl_vs_entrywise(10, 3, 5.9, i); + } +} + +TEST_F(TestPDK_SquaredExponential, test_orthogonal_columns) { + for (uint32_t i = 70; i < 75; ++i) { + run_orthogonal(5, 0.5, i); + run_orthogonal(5, 1.1, i); + run_orthogonal(5, 3.0, i); + } +} + + +class TestPDK_RBFKernelMatrix : public ::testing::Test { + protected: + + virtual void SetUp() {}; + + virtual void TearDown() {}; + + template 
+ void run(T bandwidth, T reg, int64_t m, int64_t d, uint32_t seed, bool use_reg = true) { + RNGState state_x(seed); + DenseDist D(d, m); + vector X_vec(d*m); + T* X = X_vec.data(); + RandBLAS::fill_dense(D, X, state_x); + vector regs(1,reg); + RandLAPACK::linops::RBFKernelMatrix K(m, X, d, bandwidth, regs); + K.set_eval_includes_reg(use_reg); + + vector eye(m * m, 0.0); + vector sq_colnorms(m, 0.0); + for (int64_t i = 0; i < m; ++i) { + eye[i + m*i] = 1.0; + sq_colnorms[i] = std::pow(blas::nrm2(d, X + i*d, 1), 2); + } + vector K_out_expect(m * m, 0.0); + + // (alpha, beta) = (0.25, 0.0), + T alpha = 0.25; + RandLAPACK::squared_exp_kernel_submatrix( + d, m, X, sq_colnorms.data(), m, m, K_out_expect.data(), 0, 0, bandwidth + ); + blas::scal(m * m, alpha, K_out_expect.data(), 1); + if (use_reg) { + for (int i = 0; i < m; ++i) + K_out_expect[i + i*m] += alpha * reg; + } + vector K_out_actual1(m * m, 1.0); + K(blas::Layout::ColMajor, m, alpha, eye.data(), m, 0.0, K_out_actual1.data(), m); + + T atol = d * std::numeric_limits::epsilon() * (1.0 + std::pow(bandwidth, -2)); + test::comparison::matrices_approx_equal( + blas::Layout::ColMajor, blas::Op::NoTrans, m, m, K_out_actual1.data(), m, + K_out_expect.data(), m, __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, atol + ); + + // Expected output when (alpha, beta) = (0.25, 0.3) + T beta = 0.3; + for (int i = 0; i < m*m; ++i) + K_out_expect[i] += beta; + vector K_out_actual2(m * m, 1.0); + K(blas::Layout::ColMajor, m, alpha, eye.data(), m, beta, K_out_actual2.data(), m); + + test::comparison::matrices_approx_equal( + blas::Layout::ColMajor, blas::Op::NoTrans, m, m, K_out_actual2.data(), m, + K_out_expect.data(), m, __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, atol + ); + return; + } + +}; + +TEST_F(TestPDK_RBFKernelMatrix, apply_to_eye_m100_d3) { + double mu = 0.123; + for (uint32_t i = 77; i < 80; ++i) { + run(1.0, mu, 100, 3, i, false); + run(2.0, mu, 100, 3, i, false); + run(2.345678, mu, 100, 3, i, false); + } +} + 
+TEST_F(TestPDK_RBFKernelMatrix, apply_to_eye_m256_d4) { + double mu = 0.123; + for (uint32_t i = 77; i < 80; ++i) { + run(1.0, mu, 256, 4, i, false); + run(2.0, mu, 256, 4, i, false); + run(2.345678, mu, 256, 4, i, false); + } +} + +TEST_F(TestPDK_RBFKernelMatrix, apply_to_eye_m999_d7) { + double mu = 0.123; + for (uint32_t i = 77; i < 80; ++i) { + run(1.0, mu, 999, 7, i, false); + run(2.0, mu, 999, 7, i, false); + run(2.345678, mu, 999, 7, i, false); + } +} + +TEST_F(TestPDK_RBFKernelMatrix, reg_apply_to_eye_m100_d3) { + double bandwidth = 1.1; + for (uint32_t i = 77; i < 80; ++i) { + run(bandwidth, 0.1, 100, 3, i); + run(bandwidth, 1.0, 100, 3, i); + run(bandwidth, 7.654321, 100, 3, i); + } +} + +TEST_F(TestPDK_RBFKernelMatrix, reg_apply_to_eye_m256_d4) { + double bandwidth = 1.1; + for (uint32_t i = 77; i < 80; ++i) { + run(bandwidth, 0.1, 256, 4, i); + run(bandwidth, 1.0, 256, 4, i); + run(bandwidth, 7.654321, 256, 4, i); + } +} + +TEST_F(TestPDK_RBFKernelMatrix, reg_apply_to_eye_m257_d5) { + double bandwidth = 1.1; + for (uint32_t i = 77; i < 80; ++i) { + run(bandwidth, 0.1, 257, 5, i); + run(bandwidth, 1.0, 257, 5, i); + run(bandwidth, 7.654321, 257, 5, i); + } +} \ No newline at end of file diff --git a/test/comps/test_util.cc b/test/misc/test_util.cc similarity index 99% rename from test/comps/test_util.cc rename to test/misc/test_util.cc index a2a80b80..9474ce03 100644 --- a/test/comps/test_util.cc +++ b/test/misc/test_util.cc @@ -349,10 +349,10 @@ class Test_Inplace_Square_Transpose : public ::testing::Test virtual void apply(blas::Layout layout) { int64_t n = 37; - RandBLAS::DenseDist D{n, n}; + RandBLAS::DenseDist D(n, n); RandBLAS::RNGState state(1); double *A1 = new double[n*n]; - state = RandBLAS::fill_dense(D, A1, state).second; + state = RandBLAS::fill_dense(D, A1, state); double *A2 = new double[n*n]; blas::copy(n*n, A1, 1, A2, 1); RandLAPACK::util::transpose_square(A2, n); diff --git a/test/moremats.hh b/test/moremats.hh new file mode 100644 
index 00000000..d91eb992 --- /dev/null +++ b/test/moremats.hh @@ -0,0 +1,60 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include "../RandLAPACK/RandBLAS/test/comparison.hh" + + +namespace RandLAPACK_Testing { + +using std::vector; +using blas::Layout; +using blas::Op; +using blas::Uplo; +using RandBLAS::RNGState; + +template +vector polynomial_decay_psd(int64_t m, T cond_num, T exponent, uint32_t seed) { + RandLAPACK::gen::mat_gen_info mat_info(m, m, RandLAPACK::gen::polynomial); + mat_info.cond_num = std::sqrt(cond_num); + mat_info.rank = m; + mat_info.exponent = std::sqrt(exponent); + mat_info.frac_spectrum_one = 0.05; + vector A(m * m, 0.0); + RNGState data_state(seed); + RandLAPACK::gen::mat_gen(mat_info, A.data(), data_state); + vector G(m * m, 0.0); + blas::syrk(Layout::ColMajor, Uplo::Upper, Op::NoTrans, m, m, 1.0, + A.data(), m, 0.0, G.data(), m + ); // Note: G is PSD with squared spectrum of A. + RandBLAS::symmetrize(Layout::ColMajor, Uplo::Upper, m, G.data(), m); + return G; +} + +template +vector random_gaussian_mat(int64_t m, int64_t n, uint32_t seed) { + RandBLAS::DenseDist D(m, n); + RNGState state(seed); + vector mat(m*n); + RandBLAS::fill_dense(D, mat.data(), state); + return mat; +} + +template +RNGState left_multiply_by_orthmat(int64_t m, int64_t n, std::vector &A, RNGState state) { + using std::vector; + vector U(m * m, 0.0); + RandBLAS::DenseDist DU(m, m); + auto out_state = RandBLAS::fill_dense(DU, U.data(), state); + vector tau(m, 0.0); + lapack::geqrf(m, m, U.data(), m, tau.data()); + lapack::ormqr(blas::Side::Left, blas::Op::NoTrans, m, n, m, U.data(), m, tau.data(), A.data(), m); + return out_state; +} + + +} \ No newline at end of file