diff --git a/.gitignore b/.gitignore
index 1cdca533..b140e8a4 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,4 +1,6 @@
 benchmark/build/**
+benchmark/bench_kernelalgs/**.py
+benchmark/bench_kernelalgs/rpcpy
 
 # vim
 *.sw*
diff --git a/CMake/rl_version.cmake b/CMake/rl_version.cmake
index 789b7280..48eec794 100644
--- a/CMake/rl_version.cmake
+++ b/CMake/rl_version.cmake
@@ -1,27 +1,54 @@
+# Initialize tmp variable
 set(tmp)
+
+# Find Git executable
 find_package(Git QUIET)
 if(GIT_FOUND)
-    execute_process(COMMAND ${GIT_EXECUTABLE}
-        --git-dir=${CMAKE_SOURCE_DIR}/.git describe
-        --tags --match "[0-9]*.[0-9]*.[0-9]*"
-        OUTPUT_VARIABLE tmp OUTPUT_STRIP_TRAILING_WHITESPACE
-        ERROR_QUIET)
+    message(STATUS "Git found: ${GIT_EXECUTABLE}")
+    # Ask Git to describe HEAD using the nearest tag shaped like <major>.<minor>.<patch>.
+    execute_process(
+        COMMAND ${GIT_EXECUTABLE} --git-dir=${CMAKE_SOURCE_DIR}/.git describe --tags --match "[0-9]*.[0-9]*.[0-9]*"
+        OUTPUT_VARIABLE tmp
+        OUTPUT_STRIP_TRAILING_WHITESPACE
+        ERROR_VARIABLE git_error
+        ERROR_STRIP_TRAILING_WHITESPACE
+        RESULT_VARIABLE git_result
+    )
+    message(STATUS "Git command result: ${git_result}")
+    message(STATUS "Git command output: ${tmp}")
+    if(NOT git_result EQUAL 0)
+        message(WARNING "Git command failed with error: ${git_error}")
+        set(tmp "0.0.0")
+    endif()
+else()
+    message(WARNING "Git not found, using fallback version 0.0.0")
+    set(tmp "0.0.0")
 endif()
+
+# Check if tmp is empty and set a fallback version if necessary
 if(NOT tmp)
+    message(WARNING "Git describe output is empty, using fallback version 0.0.0")
     set(tmp "0.0.0")
 endif()
 
-set(RandLAPACK_VERSION ${tmp} CACHE STRING "RandLAPACK version" FORCE)
+# Debugging: Print tmp before setting RandLAPACK_VERSION
+message(STATUS "tmp before setting RandLAPACK_VERSION: ${tmp}")
 
-string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
-  "\\1" RandLAPACK_VERSION_MAJOR ${RandLAPACK_VERSION})
+# Set RandLAPACK_VERSION without CACHE option
+set(RandLAPACK_VERSION "${tmp}")
+message(STATUS "RandLAPACK_VERSION after setting: ${RandLAPACK_VERSION}")
 
-string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
-  "\\2" RandLAPACK_VERSION_MINOR ${RandLAPACK_VERSION})
+# Ensure RandLAPACK_VERSION is not empty
+if(NOT RandLAPACK_VERSION)
+    message(FATAL_ERROR "RandLAPACK_VERSION is empty")
+endif()
 
-string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*$)"
-  "\\3" RandLAPACK_VERSION_PATCH ${RandLAPACK_VERSION})
+# Extract major, minor, and patch versions
+string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)$" "\\1" RandLAPACK_VERSION_MAJOR "${RandLAPACK_VERSION}")
+string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)$" "\\2" RandLAPACK_VERSION_MINOR "${RandLAPACK_VERSION}")
+string(REGEX REPLACE "^([0-9]+)\\.([0-9]+)\\.([0-9]+)(.*)$" "\\3" RandLAPACK_VERSION_PATCH "${RandLAPACK_VERSION}")
 
+# Print extracted version components
 message(STATUS "RandLAPACK_VERSION_MAJOR=${RandLAPACK_VERSION_MAJOR}")
 message(STATUS "RandLAPACK_VERSION_MINOR=${RandLAPACK_VERSION_MINOR}")
 message(STATUS "RandLAPACK_VERSION_PATCH=${RandLAPACK_VERSION_PATCH}")
diff --git a/RandBLAS b/RandBLAS
index 172d0963..5ca3b3e5 160000
--- a/RandBLAS
+++ b/RandBLAS
@@ -1 +1 @@
-Subproject commit 172d0963f16743defa646b32e7e0279b52230f99
+Subproject commit 5ca3b3e573f2a7c3509cc5362bf3a00a7e8e2ff6
diff --git a/RandLAPACK.hh b/RandLAPACK.hh
index 6fceabdb..c3818ae7 100644
--- a/RandLAPACK.hh
+++ b/RandLAPACK.hh
@@ -1,10 +1,16 @@
 #ifndef RANDLAPACK_HH
 #define RANDLAPACK_HH
 
+// config and dependencies
+#include "RandLAPACK/rl_blaspp.hh"
+#include "RandLAPACK/rl_lapackpp.hh"
+#include "RandBLAS.hh"
+
 // misc
 #include "RandLAPACK/misc/rl_util.hh"
 #include "RandLAPACK/misc/rl_linops.hh"
 #include "RandLAPACK/misc/rl_gen.hh"
+#include "RandLAPACK/misc/rl_pdkernels.hh"
 
 // Computational routines
 #include "RandLAPACK/comps/rl_determiter.hh"
@@ -15,6 +21,7 @@
 #include "RandLAPACK/comps/rl_syps.hh"
 #include "RandLAPACK/comps/rl_syrf.hh"
 #include "RandLAPACK/comps/rl_orth.hh"
+#include "RandLAPACK/comps/rl_rpchol.hh"
 
 // Drivers
 #include "RandLAPACK/drivers/rl_rsvd.hh"
@@ -22,6 +29,7 @@
 #include "RandLAPACK/drivers/rl_cqrrp.hh"
 #include "RandLAPACK/drivers/rl_revd2.hh"
 #include "RandLAPACK/drivers/rl_rbki.hh"
+#include "RandLAPACK/drivers/rl_krillx.hh"
 
 // Cuda functions - issues with linking/visibility when present if the below is uncommented.
 // A temporary fix is to add the below directly in the test/benchmark files.
diff --git a/RandLAPACK/CMakeLists.txt b/RandLAPACK/CMakeLists.txt
index 055e4a12..7c6d90fd 100644
--- a/RandLAPACK/CMakeLists.txt
+++ b/RandLAPACK/CMakeLists.txt
@@ -6,6 +6,7 @@ set(RandLAPACK_cxx_sources
     rl_cqrrp.hh
     rl_rsvd.hh
     rl_revd2.hh
+    rl_krillx.hh
     rl_qb.hh
     rl_orth.hh
     rl_util.hh
@@ -14,6 +15,7 @@ set(RandLAPACK_cxx_sources
     rl_rf.hh
     rl_syps.hh
     rl_syrf.hh
+    rl_rpchol.hh
     rl_gen.hh
     rl_blaspp.hh
     rl_linops.hh
diff --git a/RandLAPACK/comps/rl_determiter.hh b/RandLAPACK/comps/rl_determiter.hh
index d8d292f2..af974edb 100644
--- a/RandLAPACK/comps/rl_determiter.hh
+++ b/RandLAPACK/comps/rl_determiter.hh
@@ -1,6 +1,7 @@
 #pragma once
 
 #include "rl_blaspp.hh"
+#include "rl_linops.hh"
 
 #include <iostream>
 #include <vector>
@@ -8,11 +9,14 @@
 
 namespace RandLAPACK {
 
-// moved run_pcgls_ex to test 
-// void run_pcgls_ex(int n, int m);
+/*  Solve the saddle point problem
+    (A'A + mu*I)x = A'b - c
 
+    Have access to a matrix M such that
+    (A'A + mu*I)MM' is well-conditioned.
+*/
 template <typename T>
-void pcg(
+void pcg_saddle(
     int64_t m,
     int64_t n,
     const T* A,
@@ -28,8 +32,7 @@ void pcg(
     const T* x0, // length n
     T* x,  // length n
     T* y // length m
-    )
-{
+) {
     std::vector<T> out_a1(m, 0.0);
     std::vector<T> out_at1(n, 0.0);
     std::vector<T> out_m1(n, 0.0);
@@ -126,4 +129,315 @@ void pcg(
     blas::gemv(Layout::ColMajor, Op::NoTrans, m, n, -1.0, A, lda, x, 1, 1.0, y, 1);
 }
 
+
+// MARK: [L/B]PCG helpers
+
+template <typename T>
+struct StatefulSeminorm {  // Abstract interface for a callable seminorm on an n-by-s block of numbers.
+    virtual ~StatefulSeminorm() {};  // virtual, so implementations can be destroyed through a base pointer.
+    virtual T operator()(int64_t n, int64_t s, const T* NR) = 0;  // NR points to the block being measured.
+};
+
+template <typename T>
+struct StatefulFrobeniusNorm {
+    std::vector<T> history;  // every norm returned so far, in call order
+    StatefulFrobeniusNorm() : history{} {}
+    // 2-norm of the n*s contiguous entries starting at NR; the value is also appended to history.
+    inline T operator()(int64_t n, int64_t s, const T* NR) {
+        history.push_back(blas::nrm2(n * s, NR, 1));
+        return history.back();
+    }
+};
+
+template <typename T>
+void zero_off_diagonal(T* mat, int64_t s) {
+    // mat is treated as a contiguous s-by-s buffer: diagonal entries sit s+1 apart, so the
+    // s entries that follow diagonal entry d (stopping before diagonal d+1) are off-diagonal.
+    for (int64_t d = 0; d + 1 < s; ++d)
+        blas::scal(s, 0.0, mat + d*(s + 1) + 1, 1);
+}
+
+/**
+ * A is a symmetric column-major matrix represented by its lower triangle.
+ *
+ * If A is not PSD then this function returns an error code -(n+1).
+ * If A is (near) zero then this function returns an error code -(n+2).
+ * In all other cases this function returns k = dim(ker(A)).
+ *
+ * If A is PSD then its trailing n - k columns will be overwritten by a
+ * matrix B where pinv(A) = BB'.
+ *
+ * @param[in] n matrix dimension
+ * @param[in,out] A buffer for symmetric n-by-n matrix stored in host memory.
+ * @param[in] lda leading dimension of A.
+ * @param[out] work buffer of length >= n; overwritten by the eigenvalues of A.
+ *
+ * @returns k = dim(ker(A)), or one of the negative error codes described above.
+ */
+template <typename T>
+int64_t psd_sqrt_pinv(
+    int64_t n,
+    T* A,
+    int64_t lda,
+    T* work
+) {
+    lapack::syevd(lapack::Job::Vec, blas::Uplo::Lower, n, A, lda, work);
+    // ^ eigenvalues land in "work" in ascending order; eigenvectors overwrite A.
+    T rel_tol = 10 * std::numeric_limits<T>::epsilon();
+    // The tolerance scales with the largest eigenvalue, but is never smaller than rel_tol.
+    T abs_tol = rel_tol * std::max((T) 1.0, work[n - 1]);
+    if (work[0] < -abs_tol) {
+        std::cout << "The input matrix was not positive semidefinite." << std::endl;
+        return -(n + 1);
+    } else if (work[n - 1] < abs_tol) {
+        std::cout << "The input matrix is zero, up to numerical precision." << std::endl;
+        return -(n + 2);
+    }
+    // Walk down from the largest eigenvalue, scaling each eigenvector by 1/sqrt(eigenvalue),
+    // and stop at the first eigenvalue that does not exceed the tolerance.
+    int64_t ker = n;
+    while (ker > 0 && work[ker - 1] > abs_tol) {
+        blas::scal(n, 1/std::sqrt(work[ker - 1]), &A[(ker - 1) * lda], 1);
+        ker = ker - 1;
+    }
+    return ker;
+}
+
+/**
+ * Check if LHS is PSD. If it is, then update RHS <- pinv(LHS)*RHS.
+ *
+ * First we try to Cholesky decompose LHS. If that fails, we compute
+ * its eigendecomposition. If the eigendecomposition shows that LHS
+ * is (close to) the zero matrix or has negative eigenvalues then we
+ * return an error code. Otherwise, we use the eigendecomposition to
+ * perform the update for RHS.
+ *
+ * @param[in] n
+ *      Matrix dimension
+ * @param[in,out] LHS
+ *      buffer for an n-by-n column-major matrix; only its lower triangle is referenced.
+ *      Contents of this buffer are destroyed (it doubles as scratch space).
+ * @param[in] lda
+ *      Leading dimension of LHS.
+ * @param[in,out] RHS
+ *      buffer for n-by-n matrix.
+ * @param[in] ldb
+ *      Leading dimension of RHS.
+ * @param[out] work
+ *     buffer of size >= n*n.
+ *
+ * @returns k = rank(LHS), or a negative error code (< -n) from psd_sqrt_pinv.
+ */
+template <typename T>
+int64_t posm_square(
+    int64_t n,
+    std::vector<T> & LHS,
+    int64_t lda,
+    std::vector<T> & RHS,
+    int64_t ldb,
+    std::vector<T> & work
+) {
+    auto layout = blas::Layout::ColMajor;
+    auto uplo = blas::Uplo::Lower;
+    using blas::Op;
+    using blas::Side;
+    using blas::Diag;
+    assert(n * n <= (int64_t) work.size());
+
+    // Try Cholesky. First store a packed (leading dimension n) backup of LHS into "work",
+    // since potrf overwrites LHS in place and the fallback below needs the original entries.
+    for (int64_t j = 0; j < n; ++j)
+        std::copy(LHS.data() + j*lda, LHS.data() + j*lda + n, work.data() + j*n);
+    int64_t chol_err = lapack::potrf(uplo, n, LHS.data(), lda);
+    if (!chol_err) {
+        blas::trsm(
+            layout, Side::Left, uplo, Op::NoTrans,
+            Diag::NonUnit, n, n, 1.0, LHS.data(), lda, RHS.data(), ldb
+        ); // L y = b
+        blas::trsm(
+            layout, Side::Left, uplo, Op::Trans,
+            Diag::NonUnit, n, n, 1.0, LHS.data(), lda, RHS.data(), ldb
+        ); // L^T x = y
+        return n;
+    }
+    // Cholesky failed.
+    //      apply pinv(LHS) * RHS by computing an eigendecomposition of LHS.
+    T* LHS_eigvecs = work.data();
+    T* LHS_eigvals = LHS.data();
+    int64_t ker = psd_sqrt_pinv(n, LHS_eigvecs, n, LHS_eigvals);
+    if (ker < 0) {
+        return ker;
+    } else if (ker == n) {  // no eigenvalue exceeded the tolerance: pinv(LHS) is treated as zero.
+        for (int64_t j = 0; j < n; ++j)
+            std::fill(RHS.data() + j*ldb, RHS.data() + j*ldb + n, (T) 0.0);
+        return 0;
+    }
+    int64_t rank = n - ker;
+    T* pinv_sqrt = &LHS_eigvecs[ker * n];
+    // pinv_sqrt is n-by-rank, and pinv(LHS) = pinv_sqrt * (pinv_sqrt').
+    // The eigenvalues held in LHS are no longer needed, so LHS is reused as scratch for the
+    // rank-by-n intermediate product; writing it into "work" would overlap pinv_sqrt if rank > ker.
+    T* tmp = LHS.data();
+    blas::gemm(
+        layout, Op::Trans, Op::NoTrans, rank, n, n, 1.0, pinv_sqrt, n, RHS.data(), ldb, 0.0, tmp, rank
+    ); // tmp <- pinv_sqrt' * RHS
+    blas::gemm(
+        layout, Op::NoTrans, Op::NoTrans, n, n, rank, 1.0, pinv_sqrt, n, tmp, rank, 0.0, RHS.data(), ldb
+    ); // RHS <- pinv_sqrt * tmp
+    return rank;
+}
+
+namespace hidden {
+
+
+// bool should_stop(int64_t &k, int64_t &stalls, double normNR, double prevnormNR, double normNR0) {
+//     if (normNR < 1e-12 + 1e-9 * normNR0) {
+//         return true;
+//     } else if (normNR > 0.8 * prevnormNR) {
+//         if (stalls < 5) {
+//             stalls++;
+//         } else {
+//             k = -k;
+//             return true;
+//         }
+//     } else {
+//         stalls = 0;
+//     }
+//     return false;
+// }
+
+}
+
+// MARK: [L/B]PCG
+// Preconditioned conjugate gradient on s right-hand sides at once: refines X toward G X = H, applying N to residuals.
+template <typename T, typename FG, typename FN, typename FSeminorm>
+void lockorblock_pcg(
+    FG &G,  // system operator; must expose .m and .regs, and is invoked as G(layout, s, 1, in, n, 0, out, n)
+    const std::vector<T> &H,  // right-hand sides: n*s entries, used as an n-by-s column-major matrix
+    T tol,  // iteration stops once seminorm(N R) < tol*(1 + initial seminorm(N R))
+    int64_t max_iters,  // upper bound on the number of iterations
+    FN &N,  // preconditioner; must expose .m and is invoked with the same argument pattern as G
+    FSeminorm &seminorm,  // called as seminorm(n, s, ptr); drives the stopping test and the verbose output
+    std::vector<T> &X,  // in: initial iterate (n*s entries); out: final iterate
+    bool verbose = false  // when true, print seminorms, iteration count and subspace dimension to std::cout
+) {
+    int64_t n = G.m;
+    randblas_require(n == N.m);
+    int64_t s = ((int64_t) H.size()) / n;  // number of right-hand sides
+    int64_t ns = n*s;
+    int64_t ss = s*s;
+    randblas_require(ns == (int64_t) H.size());
+    randblas_require(ns == (int64_t) X.size());
+    bool treat_as_separable = G.regs.size() > 1;  // if true, every s-by-s coefficient matrix below is reduced to its diagonal
+    if (treat_as_separable)
+        randblas_require(s == (int64_t) G.regs.size());
+
+    using std::vector;
+
+    vector<T> R(H);  // residuals; becomes H - G X below
+    vector<T> P(ns, 0.0);  // search directions
+    vector<T> GP(P);
+    vector<T> NR_or_scratch(P);
+
+    vector<T> RNR(ss, 0.0);
+    vector<T> alpha(RNR);
+    vector<T> beta(RNR);
+    vector<T> more_scratch(RNR);  // workspace handed to posm_square
+    vector<T> alpha_beta_left_buffer(RNR);  // left-hand side matrix of the s-by-s solves for alpha and beta
+
+    T normNR = INFINITY, prevnormNR = INFINITY;  // prevnormNR is written below but never read in this function
+
+    auto layout = blas::Layout::ColMajor;
+    using blas::Op;
+
+    G(layout, s, 1.0, X.data(), n, 0.0, GP.data(), n);
+    // ^ GP <- G X
+    blas::axpy(ns, -1.0, GP.data(), 1, R.data(), 1); // R <- R - G X
+    T normR0 = seminorm(n, s, R.data());
+    // ^ seminorm of the initial residual; only used in the verbose printout.
+    N(layout, s, 1.0, R.data(), n, 0.0, P.data(), n);
+    // ^ P <- N R
+    T normNR0 = seminorm(n, s, P.data());
+    blas::gemm(
+        layout, Op::Trans, Op::NoTrans, s, s, n, 1.0, R.data(), n, P.data(), n, 0.0, RNR.data(), s
+    ); // RNR <- R^T P = R^T N R
+    if (treat_as_separable)
+        zero_off_diagonal(RNR.data(), s);
+    alpha = RNR;
+
+    int64_t k = 0;
+    T stop_abstol = tol*(1.0 + normNR0);
+    int64_t subspace_dim = 0;
+    if (verbose)
+        std::cout << "normNR : " << normNR0 << "\tnormR : " << normR0 << "\tk: 0\tdim : 0\n";
+    while (subspace_dim < n && k < max_iters) {
+        //
+        // Update X and R
+        //
+        k++;
+
+        G(layout, s, (T) 1.0, P.data(), n, (T) 0.0, GP.data(), n);
+        // ^ GP <- G P
+        blas::gemm(
+            layout, Op::Trans, Op::NoTrans, s, s, n, 1.0, P.data(), n, GP.data(), n, 0.0, alpha_beta_left_buffer.data(), s
+        ); // alpha_beta_left_buffer <- P^T G P
+        if (treat_as_separable)
+            zero_off_diagonal(alpha_beta_left_buffer.data(), s);
+
+        int64_t subspace_incr = posm_square(
+            s, alpha_beta_left_buffer, s, alpha, s, more_scratch
+        ); // alpha <- (alpha_beta_left_buffer)^(-1) alpha
+        if (treat_as_separable && subspace_incr > 0)
+            subspace_incr = 1;
+
+        if (subspace_incr < - ((int64_t) s) ) // posm_square reports failure with a value below -s
+            break;
+        subspace_dim = subspace_dim + subspace_incr;
+
+        blas::gemm(
+            layout, Op::NoTrans, Op::NoTrans, n, s, s, 1.0, P.data(), n, alpha.data(), s, 1.0, X.data(), n
+        ); // X <- X + P alpha
+        blas::gemm(
+            layout, Op::NoTrans, Op::NoTrans, n, s, s, -1.0, GP.data(), n, alpha.data(), s, 1.0, R.data(), n
+        ); // R <- R - GP alpha
+
+        //
+        //  Check termination criteria
+        //
+        //      TODO: change how we check termination criteria in the event that we're working
+        //            with treat_as_separable = true.
+        T normR = seminorm(n, s, R.data());
+
+        N(layout, s, 1.0, R.data(), n, 0.0, NR_or_scratch.data(), n); // NR <- N R
+        prevnormNR = normNR;
+        normNR = seminorm(n, s, NR_or_scratch.data());
+        if (verbose)
+            std::cout << "normNR : " << normNR << "\tnormR : " << normR << "\tk: " << k << "\tdim : " << subspace_dim << '\n';
+        if (normNR < stop_abstol)
+            break;
+        //
+        //  Update P, beta, and alpha
+        //
+        alpha_beta_left_buffer = RNR; // keep the previous R^T N R; it is the left-hand side of the solve for beta
+        blas::gemm(
+            layout, blas::Op::Trans, blas::Op::NoTrans, s, s, n, 1.0, R.data(), n, NR_or_scratch.data(), n, 0.0, RNR.data(), s
+        ); // RNR <- R^T NR
+        if (treat_as_separable)
+            zero_off_diagonal(RNR.data(), s);
+        alpha = RNR;
+        beta = alpha;
+        int err = posm_square(
+            s, alpha_beta_left_buffer, s, beta, s, more_scratch
+        ); // beta <- (alpha_beta_left_buffer)^-1 beta
+        if (err < - ((int64_t) s))
+            break;
+        blas::gemm(
+            layout, Op::NoTrans, Op::NoTrans, n, s, s, 1.0, P.data(), n, beta.data(), s, 1.0, NR_or_scratch.data(), n
+        ); // NR_or_scratch <- NR_or_scratch + P * beta  (that is, N R + P beta)
+        P = NR_or_scratch;
+    }
+    return;
+}
+
+
 } // end namespace RandLAPACK
diff --git a/RandLAPACK/comps/rl_preconditioners.hh b/RandLAPACK/comps/rl_preconditioners.hh
index 4f2e9567..13bd7da0 100644
--- a/RandLAPACK/comps/rl_preconditioners.hh
+++ b/RandLAPACK/comps/rl_preconditioners.hh
@@ -3,11 +3,14 @@
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"
 #include "rl_util.hh"
+#include "rl_linops.hh"
+#include "rl_pdkernels.hh"
+
 #include "rl_orth.hh"
 #include "rl_syps.hh"
 #include "rl_syrf.hh"
 #include "rl_revd2.hh"
-#include "rl_linops.hh"
+#include "rl_rpchol.hh"
 
 #include <RandBLAS.hh>
 #include <math.h>
@@ -141,15 +144,11 @@ RandBLAS::RNGState<RNG> rpc_data_svd_saso(
     T *sigma_sk, //buffer of size at least n.
     RandBLAS::RNGState<RNG> state
 ) {
-    RandBLAS::SparseDist D{
-        .n_rows = d,
-        .n_cols = m,
-        .vec_nnz = k
-    };
+    RandBLAS::SparseDist D(d, m, k, RandBLAS::Axis::Short);
     RandBLAS::SparseSkOp<T> S(D, state);
-    auto next_state = RandBLAS::fill_sparse(S);
+    RandBLAS::fill_sparse(S);
     rpc_data_svd(layout, m, n, A, lda, S, V_sk, sigma_sk);
-    return next_state;
+    return S.next_state;
 }
 
 /**
@@ -196,19 +195,23 @@ int64_t make_right_orthogonalizer(
     int64_t n,
     T* V,
     T* sigma,
-    T mu
+    T mu,
+    int64_t cols_V = -1
 ) {
+    if (cols_V < 0) {
+        cols_V = n;
+    }
     double sqrtmu = std::sqrt((double) mu);
     auto regularized = [sqrtmu](T s) {
         return (sqrtmu == 0) ? s : (T) std::hypot((double) s, sqrtmu);
     };
     T curr_s = regularized(sigma[0]);
-    T abstol = curr_s * n * std::numeric_limits<T>::epsilon();
+    T abstol = curr_s * cols_V * std::numeric_limits<T>::epsilon();
     
     int64_t rank = 0;
     int64_t inter_col_stride = (layout == Layout::ColMajor) ? n : 1;
-    int64_t intra_col_stride = (layout == Layout::ColMajor) ? 1 : n;
-    while (rank < n) {
+    int64_t intra_col_stride = (layout == Layout::ColMajor) ? 1 : cols_V;
+    while (rank < cols_V) {
         curr_s = regularized(sigma[rank]);
         if (curr_s < abstol)
             break;
@@ -277,7 +280,7 @@ int64_t make_right_orthogonalizer(
  */
 template <typename T, typename RNG>
 RandBLAS::RNGState<RNG> nystrom_pc_data(
-    SymmetricLinearOperator<T> &A,
+    linops::SymmetricLinearOperator<T> &A,
     std::vector<T> &V,
     std::vector<T> &eigvals,
     int64_t &k,
@@ -314,8 +317,8 @@ RandBLAS::RNGState<RNG> nystrom_pc_data(
  * This wraps a function of the same name that accepts a SymmetricLinearOperator object.
  * The purpose of this wrapper is just to define such an object from data (uplo, A, m).
  */
-template <typename T, typename RNG>
-RandBLAS::RNGState<RNG> nystrom_pc_data(
+template <typename T, typename STATE>
+STATE nystrom_pc_data(
     Uplo uplo,
     const T* A,
     int64_t m,
@@ -323,13 +326,78 @@ RandBLAS::RNGState<RNG> nystrom_pc_data(
     std::vector<T> &eigvals,
     int64_t &k,
     T mu_min,
-    RandBLAS::RNGState<RNG> state,
+    STATE state,
     int64_t num_syps_passes = 3,
     int64_t num_steps_power_iter_error_est = 10
 ) {
-    ExplicitSymLinOp<T> A_linop(m, uplo, A, m, Layout::ColMajor);
+    linops::ExplicitSymLinOp<T> A_linop(m, uplo, A, m, Layout::ColMajor);
     return nystrom_pc_data(A_linop, V, eigvals, k, mu_min, state, num_syps_passes, num_steps_power_iter_error_est);
 }
 
+/**
+ * Preconditioner data from rp_cholesky: on exit V (n-by-k) holds the left singular vectors of the
+ * computed factor, eigvals holds its squared singular values, and k may have been reduced.
+ * TODO: add an overload that omits "n" and assumes A implements some linear operator interface.
+ */
+template <typename T, typename STATE, typename FUNC>
+STATE rpchol_pc_data(
+    int64_t n, FUNC &A_stateless, int64_t &k, int64_t b, T* V, T* eigvals, STATE state
+) {
+    std::vector<int64_t> selection(k, -1);
+    state = RandLAPACK::rp_cholesky(n, A_stateless, k, selection.data(), V, b, state);
+    // ^ A_stateless \approx VV'; need to convert VV' into its eigendecomposition.
+    if (k <= 0) return state;  // nothing was factored, and gesdd would reject ldvt = k = 0.
+    std::vector<T> work(k*k, 0.0);
+    lapack::gesdd(lapack::Job::OverwriteVec, n, k, V, n, eigvals, nullptr, 1, work.data(), k);
+    // ^ V has been overwritten with its (nontrivial) left singular vectors.
+    for (int64_t i = 0; i < k; ++i) eigvals[i] = std::pow(eigvals[i], 2);
+    return state;
+}
+
+
+/**
+ * V is a buffer for an n-by-k matrix in column-major format.
+ *
+ * We implicitly have our hands on an n-by-n matrix A = F F' where
+ * F is n-by-k and defined in terms of (V, eigvals, use_eigvals).
+ * If use_eigvals = true, then
+ *      F = V * diag(sqrt(eigvals)), V is column-orthonormal, and
+ *      eigvals contains positive numbers sorted in decreasing order.
+ * Otherwise,
+ *      F = V and we ignore the values of "eigvals" passed as input.
+ *
+ * upper_tri is a buffer for an n-by-n upper-triangular matrix in
+ * column-major format. It implicitly defines a matrix
+ *
+ *   A_conj = inv(upper_tri)' A inv(upper_tri)
+ *
+ * This function overwrites (V, eigvals) with the eigenvectors and
+ * eigenvalues of A_conj, where eigenvalues are sorted in decreasing
+ * order. "work" is scratch space, resized to k*k entries if it is smaller.
+ **/
+template <typename T>
+void ut_conjugate_spectral_pc_data(
+    int64_t n, int64_t k, T* V, T* eigvals, const T* upper_tri, std::vector<T> &work, bool use_eigvals
+) {
+    // Step 1: Get our hands on F so that A = FF'.
+    if (use_eigvals) {
+        for (int64_t i = 0; i < k; ++i) {
+            blas::scal(n, (T) std::pow(eigvals[i], (T) 0.5), V + i*n, 1);
+        }
+    }
+    // Step 2: Overwrite F = inv(upper_tri)'F.
+    //         In BLAS terms, we solve trans(upper_tri) X = F, and store X by overwriting F.
+    blas::trsm(blas::Layout::ColMajor, blas::Side::Left, blas::Uplo::Upper, blas::Op::Trans, blas::Diag::NonUnit, n, k, 1.0, upper_tri, n, V, n);
+    // Step 3: Call GESDD: overwrite F with its left
+    //         singular vectors and overwrite eigvals
+    //         with its squared singular values. "work" receives the k-by-k right factor.
+    if ((int64_t) work.size() < k*k)
+        work.resize(k*k);
+    lapack::gesdd(lapack::Job::OverwriteVec, n, k, V, n, eigvals, nullptr, 1, work.data(), k);
+    for (int64_t i = 0; i < k; ++i) {
+        eigvals[i] = std::pow(eigvals[i], 2.0);
+    }
+    return;
+}
 
 }  // end namespace RandLAPACK
diff --git a/RandLAPACK/comps/rl_rpchol.hh b/RandLAPACK/comps/rl_rpchol.hh
new file mode 100644
index 00000000..7ac17f2d
--- /dev/null
+++ b/RandLAPACK/comps/rl_rpchol.hh
@@ -0,0 +1,205 @@
+#pragma once
+
+#include "rl_lapackpp.hh"
+#include <blas.hh>
+#include <RandBLAS.hh>
+#include <algorithm>
+#include <vector>
+#include <set>
+
+namespace RandLAPACK {
+
+namespace _rpchol_impl {
+
+using std::vector;
+using blas::Layout;
+
+template <typename T, typename FUNC_T>
+void compute_columns(
+    Layout layout, int64_t N, FUNC_T &K_stateless, vector<int64_t> &col_indices, T* buff
+) {
+    // Fill buff (N rows, col_indices.size() columns, column-major) so that entry (r, c)
+    // equals K_stateless(r, col_indices[c]). Only column-major output is supported.
+    randblas_require(layout == Layout::ColMajor);
+    const int64_t width = col_indices.size();
+    // The two loops are kept perfectly nested so that collapse(2) applies.
+    #pragma omp parallel for collapse(2)
+    for (int64_t c = 0; c < width; ++c) {
+        for (int64_t r = 0; r < N; ++r)
+            buff[r + c*N] = K_stateless(r, col_indices[c]);
+    }
+}
+
+template <typename T>
+void pack_selected_rows(
+    Layout layout, int64_t rows_mat, int64_t cols_mat, T* mat, vector<int64_t> &row_indices, T* submat
+) {
+    // submat <- mat(row_indices, :); both are column-major and submat has row_indices.size() rows.
+    randblas_require(layout == Layout::ColMajor);
+    const int64_t ld_sub = row_indices.size();
+    T* dest = submat;
+    for (int64_t src_row : row_indices)
+        blas::copy(cols_mat, mat + src_row, rows_mat, dest++, ld_sub);
+}
+
+template <typename T>
+int downdate_d_and_cdf(Layout layout, int64_t N, vector<int64_t> &indices, T* F_panel, vector<T> &d, vector<T> &cdf) {
+    // Subtract the squared entries of each F_panel column (N-by-indices.size(), column-major)
+    // from d, then rebuild cdf from d. Returns 1 or 2 when weights_to_cdf fails its "sum >="
+    // or "val >= error_if_below" check, respectively, and 0 otherwise.
+    randblas_require(layout == Layout::ColMajor);
+    const int64_t panel_width = indices.size();
+    const T* column = F_panel;
+    for (int64_t j = 0; j < panel_width; ++j, column += N) {
+        for (int64_t i = 0; i < N; ++i)
+            d[i] -= column[i] * column[i];
+    }
+    // Rounding errors may leave residue at the listed indices, so set those entries to exactly zero.
+    for (auto idx : indices)
+        d[idx] = 0.0;
+    cdf = d;
+    int status = 0;
+    try {
+        RandBLAS::weights_to_cdf(N, cdf.data());
+    } catch(RandBLAS::Error &e) {
+        const std::string what{e.what()};
+        if (what.find("sum >=") != std::string::npos) status = 1;
+        else if (what.find("val >= error_if_below") != std::string::npos) status = 2;
+    }
+    return status;
+}
+
+} // end namespace RandLAPACK::_rpchol_impl
+
+/***
+ * Computes a rank-k approximation of an implicit n-by-n matrix whose (i,j)^{th}
+ * entry is A_stateless(i,j), where A_stateless is a stateless function. We build
+ * the approximation iteratively and increase the rank by at most "b" at each iteration.
+ * 
+ * Implements Algorithm 4 from https://arxiv.org/abs/2304.12465.
+ * 
+ * Here's example code where the implicit matrix is given by a squared exponential kernel:
+ * 
+ *      // Assume we've already defined ...
+ *      //         X  : a rows_x by cols_x double-precision matrix (suitably standardized)
+ *      //              where each column defines a datapoint.
+ *      //  bandwidth : scale for the squared exponential kernel
+ *      //          k : an int64_t variable holding the target rank (reduced on early exit)
+ *      auto A = [X, rows_x, cols_x, bandwidth](int64_t i, int64_t j) {
+ *          double out = 0;
+ *          double* Xi = X + i*rows_x;
+ *          double* Xj = X + j*rows_x;
+ *          for (int64_t ell = 0; ell < rows_x; ++ell) {
+ *              double val = (Xi[ell] - Xj[ell]) / (std::sqrt(2)*bandwidth);
+ *              out += val*val;
+ *          }
+ *          out = std::exp(-out);
+ *          return out;
+ *      };
+ *      std::vector<double> F(cols_x*k, 0.0);
+ *      std::vector<int64_t> selection(k);
+ *      RandBLAS::RNGState state_in(0);
+ *      auto state_out = rp_cholesky(cols_x, A, k, selection.data(), F.data(), 64, state_in);
+ * 
+ * Notes
+ * -----
+ * Compare to 
+ * https://github.com/eepperly/Robust-randomized-preconditioning-for-kernel-ridge-regression/blob/main/code/choleskybase.m
+ * 
+ */
+template <typename T, typename FUNC_T, typename STATE, typename CALLBACK>
+STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S,  T* F, int64_t b, STATE state, CALLBACK &cb) {
+    // TODO: make this function robust to rank-deficient matrices.
+    using RandBLAS::sample_indices_iid;
+    using RandBLAS::weights_to_cdf;
+    using blas::Op;
+    using blas::Uplo;
+    auto layout = blas::Layout::ColMajor;
+    auto uplo = Uplo::Upper;
+
+    std::vector<T> work_mat(b*k, 0.0);  // scratch for row-submatrices of F and of F_panel
+    std::vector<T> d(n, 0.0);           // sampling weights: diag(A) minus squared entries of F so far
+    std::vector<T> cdf(n);              // cumulative distribution built from d
+
+    std::vector<int64_t> Sprime{};      // column indices chosen in the current iteration
+    // The initial sampling weights are the diagonal entries of the implicit matrix.
+    for (int64_t i = 0; i < n; ++i)
+        d[i] = A_stateless(i,i);
+    cdf = d;
+    weights_to_cdf(n, cdf.data());
+    int w_status = 0;
+    int64_t ell  = 0;
+    while (ell < k) {
+        if (w_status) {
+            std::cout << "weights_to_cdf failed with exit code " << w_status << ".\n";
+            std::cout << "Returning early, with approximation rank = " << ell << "\n\n";
+            k = ell;
+            cb(k);
+            return state;
+        }
+        //
+        //  1. Compute the next block of column indices
+        //
+        int64_t curr_B = std::min(b, k - ell);
+        Sprime.resize(curr_B);
+        state = sample_indices_iid(n, cdf.data(), curr_B, Sprime.data(), state);
+        std::sort( Sprime.begin(), Sprime.end() );
+        Sprime.erase( std::unique( Sprime.begin(), Sprime.end() ), Sprime.end() );
+        int64_t ell_incr = Sprime.size();  // may be smaller than curr_B once repeated samples are dropped
+
+        //
+        //  2. Compute F_panel: the next block of ell_incr columns in F.
+        //
+        T* F_panel = F + ell*n;
+        //
+        //      2.1. Overwrite F_panel with the matrix "G" from Line 5 of [arXiv:2304.12465, Algorithm 4].
+        //
+        //           First we compute a submatrix of columns of A and then we downdate with GEMM.
+        //           The downdate is delicate since the output matrix shares a buffer with one of the
+        //           input matrices, but it's okay since they're non-overlapping regions of that buffer.
+        //
+        _rpchol_impl::compute_columns(layout, n, A_stateless, Sprime, F_panel);
+        //           ^ F_panel = A(:, Sprime).
+        _rpchol_impl::pack_selected_rows(layout, n, ell, F, Sprime, work_mat.data());
+        //           ^ work_mat is a copy of F(Sprime, 1:ell).
+        blas::gemm(
+            layout, Op::NoTrans, Op::Trans, n, ell_incr, ell,
+            -1.0, F, n, work_mat.data(), ell_incr, 1.0, F_panel, n
+        );
+        //
+        //      2.2. Execute Lines 6 and 7 of [arXiv:2304.12465, Algorithm 4].
+        //
+        _rpchol_impl::pack_selected_rows(layout, n, ell_incr, F_panel, Sprime, work_mat.data());
+        int64_t c_status = lapack::potrf(uplo, ell_incr, work_mat.data(), ell_incr);
+        if (c_status) {
+            std::cout << "Cholesky failed with exit code " << c_status << ".\n";
+            std::cout << "Returning early, with approximation rank = " << ell << "\n\n";
+            k = ell;
+            cb(k);
+            return state;
+        }
+        blas::trsm(
+            layout, blas::Side::Right, uplo, Op::NoTrans, blas::Diag::NonUnit,
+            n, ell_incr, 1.0, work_mat.data(), ell_incr, F_panel, n
+        );
+
+        //
+        // 3. Update S, d, cdf and ell.
+        //
+        std::copy(Sprime.begin(), Sprime.end(), S + ell);
+        w_status = _rpchol_impl::downdate_d_and_cdf(layout, n, Sprime, F_panel, d, cdf);
+        ell = ell + ell_incr;
+    }
+    cb(k);
+    return state;
+}
+
+template <typename T, typename FUNC_T, typename STATE>
+STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S,  T* F, int64_t b, STATE state) {
+    // Convenience overload with a no-op callback. Forward the returned state so the caller receives
+    // the generator state as advanced by sampling, rather than a copy of the state it passed in.
+    auto cb = [](int64_t i) { return i; };
+    return rp_cholesky(n, A_stateless, k, S, F, b, state, cb);
+}
+
+
+}
diff --git a/RandLAPACK/comps/rl_rs.hh b/RandLAPACK/comps/rl_rs.hh
index 978be5c5..c4437057 100644
--- a/RandLAPACK/comps/rl_rs.hh
+++ b/RandLAPACK/comps/rl_rs.hh
@@ -132,11 +132,11 @@ int RS<T, RNG>::call(
     if (p % 2 == 0) {
         // Fill n by k Omega
         RandBLAS::DenseDist D(n, k);
-        state = RandBLAS::fill_dense(D, Omega, state).second;
+        state = RandBLAS::fill_dense(D, Omega, state);
     } else {
         // Fill m by k Omega_1
         RandBLAS::DenseDist D(m, k);
-        state = RandBLAS::fill_dense(D, Omega_1, state).second;
+        state = RandBLAS::fill_dense(D, Omega_1, state);
 
         // multiply A' by Omega results in n by k omega
         blas::gemm(Layout::ColMajor, Op::Trans, Op::NoTrans, n, k, m, 1.0, A, m, Omega_1, m, 0.0, Omega, n);
diff --git a/RandLAPACK/comps/rl_syps.hh b/RandLAPACK/comps/rl_syps.hh
index e1d42a2a..e071bf9e 100644
--- a/RandLAPACK/comps/rl_syps.hh
+++ b/RandLAPACK/comps/rl_syps.hh
@@ -31,7 +31,7 @@ class SymmetricPowerSketch {
         ) = 0;
 
         virtual int call(
-            SymmetricLinearOperator<T> &A,
+            linops::SymmetricLinearOperator<T> &A,
             int64_t k,
             RandBLAS::RNGState<RNG> &state,
             T* &skop_buff = nullptr,
@@ -108,7 +108,7 @@ class SYPS : public SymmetricPowerSketch<T, RNG> {
         );
 
         int call(
-            SymmetricLinearOperator<T> &A,
+            linops::SymmetricLinearOperator<T> &A,
             int64_t k,
             RandBLAS::RNGState<RNG> &state,
             T* &skop_buff,
@@ -125,7 +125,7 @@ class SYPS : public SymmetricPowerSketch<T, RNG> {
 // -----------------------------------------------------------------------------
 template <typename T, typename RNG>
 int SYPS<T, RNG>::call(
-    SymmetricLinearOperator<T> &A,
+    linops::SymmetricLinearOperator<T> &A,
     int64_t k,
     RandBLAS::RNGState<RNG> &state,
     T* &skop_buff,
@@ -140,12 +140,12 @@ int SYPS<T, RNG>::call(
      if (!callers_skop_buff)
          skop_buff = new T[m * k];
     RandBLAS::DenseDist D(m, k);
-    state = RandBLAS::fill_dense(D, skop_buff, state).second;
+    state = RandBLAS::fill_dense(D, skop_buff, state);
 
      bool callers_work_buff = work_buff != nullptr;
      if (!callers_work_buff)
          work_buff = new T[m * k];
-    RandBLAS::util::safe_scal(m * k, 0.0, work_buff, 1);
+    RandBLAS::util::safe_scal(m * k, (T) 0.0, work_buff, 1);
 
     T *symm_out = work_buff;
     T *symm_in  = skop_buff;
@@ -166,8 +166,6 @@ int SYPS<T, RNG>::call(
     if (p % 2 == 1)
         blas::copy(m * k, work_buff, 1, skop_buff, 1);
 
-    RandBLAS::DenseSkOp<T>(D, state, skop_buff);
-
     if (!callers_work_buff)
         delete[] work_buff;
 
@@ -186,7 +184,7 @@ int SYPS<T, RNG>::call(
     T* &skop_buff,
     T* work_buff
 ) {
-    ExplicitSymLinOp<T> A_linop(m, uplo, A, lda, Layout::ColMajor);
+    linops::ExplicitSymLinOp<T> A_linop(m, uplo, A, lda, Layout::ColMajor);
     return call(A_linop, k, state, skop_buff, work_buff);
 }
 
diff --git a/RandLAPACK/comps/rl_syrf.hh b/RandLAPACK/comps/rl_syrf.hh
index ab40d79e..246dd2de 100644
--- a/RandLAPACK/comps/rl_syrf.hh
+++ b/RandLAPACK/comps/rl_syrf.hh
@@ -22,7 +22,7 @@ class SymmetricRangeFinder {
         virtual ~SymmetricRangeFinder() {}
 
         virtual int call(
-            SymmetricLinearOperator<T> &A,
+            linops::SymmetricLinearOperator<T> &A,
             int64_t k,
             std::vector<T> &Q,
             RandBLAS::RNGState<RNG> &state,
@@ -94,7 +94,7 @@ class SYRF : public SymmetricRangeFinder<T, RNG> {
         ) override;
 
         int call(
-            SymmetricLinearOperator<T> &A,
+            linops::SymmetricLinearOperator<T> &A,
             int64_t k,
             std::vector<T> &Q,
             RandBLAS::RNGState<RNG> &state,
@@ -116,7 +116,7 @@ class SYRF : public SymmetricRangeFinder<T, RNG> {
 // -----------------------------------------------------------------------------
 template <typename T, typename RNG>
 int SYRF<T, RNG>::call(
-    SymmetricLinearOperator<T> &A,
+    linops::SymmetricLinearOperator<T> &A,
     int64_t k,
     std::vector<T> &Q,
     RandBLAS::RNGState<RNG> &state,
@@ -127,13 +127,13 @@ int SYRF<T, RNG>::call(
     if (!callers_work_buff)
         work_buff = new T[m * k];
 
-    RandBLAS::util::safe_scal(m * k, 0.0, work_buff, 1);
+    RandBLAS::util::safe_scal(m * k, (T) 0.0, work_buff, 1);
 
     T* Q_dat = util::upsize(m * k, Q);
     SYPS_Obj.call(A, k, state, work_buff, Q_dat);
 
     // Q = orth(A * Omega)
-    A(Layout::ColMajor, k, 1.0, work_buff, m, 0.0, Q_dat, m);
+    A(Layout::ColMajor, k, (T) 1.0, work_buff, m, (T) 0.0, Q_dat, m);
     if(this->cond_check) {
         util::upsize(m * k, this->cond_work_mat);
         util::upsize(k, this->cond_work_vec);
@@ -161,7 +161,7 @@ int SYRF<T, RNG>::call(
     RandBLAS::RNGState<RNG> &state,
     T* work_buff
 ) {
-    ExplicitSymLinOp<T> A_linop(m, uplo, A, m, Layout::ColMajor);
+    linops::ExplicitSymLinOp<T> A_linop(m, uplo, A, m, Layout::ColMajor);
     return this->call(A_linop, k, Q, state, work_buff);
 }
 
diff --git a/RandLAPACK/drivers/rl_cqrrp.hh b/RandLAPACK/drivers/rl_cqrrp.hh
index bfdf3d18..8efcba40 100644
--- a/RandLAPACK/drivers/rl_cqrrp.hh
+++ b/RandLAPACK/drivers/rl_cqrrp.hh
@@ -298,7 +298,7 @@ int CQRRP_blocked<T, RNG>::call(
     // as LU is not intended to be used with rank-deficient matrices.
     T* S  = ( T * ) calloc( d * m, sizeof( T ) );
     RandBLAS::DenseDist D(d, m);
-    state = RandBLAS::fill_dense(D, S, state).second;
+    state = RandBLAS::fill_dense(D, S, state);
     blas::gemm(Layout::ColMajor, Op::NoTrans, Op::NoTrans, d, n, m, 1.0, S, d, A, m, 0.0, A_sk, d);
     free(S);
 
diff --git a/RandLAPACK/drivers/rl_cqrrpt.hh b/RandLAPACK/drivers/rl_cqrrpt.hh
index 360a8143..fed8038b 100644
--- a/RandLAPACK/drivers/rl_cqrrpt.hh
+++ b/RandLAPACK/drivers/rl_cqrrpt.hh
@@ -196,9 +196,10 @@ int CQRRPT<T, RNG>::call(
         saso_t_start = high_resolution_clock::now();
     
     /// Generating a SASO
-    RandBLAS::SparseDist DS = {.n_rows = d, .n_cols = m, .vec_nnz = this->nnz};
+    RandBLAS::SparseDist DS(d, m, this->nnz);
     RandBLAS::SparseSkOp<T, RNG> S(DS, state);
-    state = RandBLAS::fill_sparse(S);
+    RandBLAS::fill_sparse(S);
+    state = S.next_state;
 
     /// Applying a SASO
     RandBLAS::sketch_general(
diff --git a/RandLAPACK/drivers/rl_hqrrp.hh b/RandLAPACK/drivers/rl_hqrrp.hh
index b84a34f2..d565cd9d 100644
--- a/RandLAPACK/drivers/rl_hqrrp.hh
+++ b/RandLAPACK/drivers/rl_hqrrp.hh
@@ -345,18 +345,16 @@ int64_t NoFLA_Apply_Q_WY_lhfc_blk_var4(
 // ============================================================================
 template <typename T>
 int64_t NoFLA_QRP_compute_norms(
-    int64_t m_A, int64_t n_A, T * buff_A, int64_t ldim_A,
-    T * buff_d, T * buff_e ) {
+    int64_t m_A, int64_t n_A, T * buff_A, int64_t ldim_A, T * buff_d, T * buff_e
+) {
     //
     // It computes the column norms of matrix A. The norms are stored int64_to 
     // vectors d and e.
     //
 
-    int64_t     j, i_one = 1;
     // Main loop.
-    //#pragma omp parallel for
-    for( j = 0; j < n_A; j++ ) {
-        * buff_d = blas::nrm2(m_A, buff_A, i_one);
+    for(int64_t j = 0; j < n_A; j++ ) {
+        * buff_d = blas::nrm2(m_A, buff_A, 1);
         * buff_e = * buff_d;
         buff_A += ldim_A;
         buff_d++;
@@ -560,8 +558,8 @@ static int64_t CHOLQR_mod_WY(
     // Entries of tau will be placed on the main diagonal of matrix T from orhr_col().
     for(i = 0; i < n_A; ++i)
         buff_t[i] = buff_T[(ldim_T + 1) * i];
-    #endif
     return 0;
+    #endif
 }
 
 // ============================================================================
@@ -934,8 +932,8 @@ int64_t hqrrp(
     }
 
     // Initialize matrices G and Y.
-    RandBLAS::DenseDist D(nb_alg + pp, m_A, RandBLAS::DenseDistName::Uniform);
-    state = RandBLAS::fill_dense(D, buff_G, state).second;
+    RandBLAS::DenseDist D(nb_alg + pp, m_A, RandBLAS::ScalarDist::Uniform);
+    state = RandBLAS::fill_dense(D, buff_G, state);
     
     blas::gemm(Layout::ColMajor,
                 Op::NoTrans, Op::NoTrans, m_Y, n_Y, m_A, 
diff --git a/RandLAPACK/drivers/rl_krillx.hh b/RandLAPACK/drivers/rl_krillx.hh
new file mode 100644
index 00000000..9e0d2211
--- /dev/null
+++ b/RandLAPACK/drivers/rl_krillx.hh
@@ -0,0 +1,162 @@
+#pragma once
+
+#include "rl_blaspp.hh"
+#include "rl_lapackpp.hh"
+#include "rl_linops.hh"
+#include "rl_preconditioners.hh"
+#include "rl_rpchol.hh"
+#include "rl_pdkernels.hh"
+#include "rl_determiter.hh"
+
+#include <RandBLAS.hh>
+#include <limits>
+#include <vector>
+
+/**
+ * 
+ * TODO:
+ *  (1) finish and test krill_restricted_rpchol
+ *  (2) write and test a krill_restricted function that accepts the centers as inputs
+ *      in advance.
+ *  (3) See also, rl_preconditioners.hh
+ * 
+ */
+
+namespace RandLAPACK {
+
+/**
+ * Fun thing about the name KRILLx:
+ * 
+ *      we can do KRILLrs for KRILL with lockstep PCG for regularization sweep.
+ * 
+ *      we can do KRILLb (?) for "random lifting + block" version.
+ */
+
+using std::vector;
+
+template <typename T, typename FUNC, typename SEMINORM, typename STATE>
+STATE krill_full_rpchol(
+    int64_t n, FUNC &G, vector<T> &H, vector<T> &X, T tol,
+    STATE state, SEMINORM seminorm, int64_t rpchol_block_size = -1, int64_t max_iters = 20, int64_t k = -1
+) {
+    // Runs rpchol_pc_data on the unregularized G to get (V, eigvals), builds a SpectralPrecond from
+    // them, then calls lockorblock_pcg on the regularized G with the ell = H.size()/n columns of H.
+    randblas_require(n > 0);
+    auto mus = G.regs;
+    int64_t ell = ((int64_t) H.size()) / n;
+    randblas_require(ell * n == (int64_t) H.size());
+    randblas_require((int64_t) mus.size() == 1 || (int64_t) mus.size() == ell);
+    if (rpchol_block_size < 0)
+        rpchol_block_size = std::max((int64_t) 1, std::min((int64_t) 64, n/4));
+    if (k < 0)
+        k = (int64_t) std::sqrt(n);
+
+    vector<T> V(n*k, 0.0);
+    vector<T> eigvals(k, 0.0);
+    G.set_eval_includes_reg(false);
+    state = rpchol_pc_data(n, G, k, rpchol_block_size, V.data(), eigvals.data(), state);
+    linops::SpectralPrecond<T> invP(n);
+    invP.prep(V, eigvals, mus, ell);
+    G.set_eval_includes_reg(true);
+    lockorblock_pcg(G, H, tol, max_iters, invP, seminorm, X, true);
+    return state;
+}
+
+/**
+ * We start with a regularized kernel linear operator G and target data H.
+ * We use "K" to denote the unregularized version of G, which can be accessed
+ * by calling G.set_eval_includes_reg(false);
+ * 
+ * If G.regs.size() == 1, then the nominal KRR problem reduces to computing
+ * 
+ *     (K + G.regs[0] * I) X = H.       (*)
+ * 
+ * If G.regs.size() > 1, then KRR is nominally about solving the independent
+ * collection of problems
+ * 
+ *      (K + mu_i * I) x_i = h_i,       (**)
+ * 
+ * where K is the unregularized version of G, mu_i = G.regs[i], and x_i, h_i
+ * are the i-th columns of X and H respectively. In this situation we need
+ * H to have exactly G.regs.size() columns.
+ *      
+ * This function produces __approximate__ solutions to KRR problems. It does so
+ * by finding a set of indices for which
+ * 
+ *      K_hat = K(:,inds) * inv(K(inds, inds)) * K(inds, :) 
+ * 
+ * is a good low-rank approximation of K. We spend O(n*k^2) arithmetic operations and
+ * O(n*k) evaluations of K(i,j) to get our hands on "inds" and a factored representation
+ * of K_hat.
+ * 
+ * Given inds, we turn our attention to solving the problem
+ * 
+ *      min{ || K(:,inds) x - H ||_2^2 + mu || sqrtm(K(inds, inds)) x ||_2^2 : x  }.
+ *      
+ * We don't store K(:,inds) explicitly. Instead, we have access to a matrix V where
+ * 
+ *      (i)   K_hat = VV',
+ *      (ii)  V(inds,:)V(inds,:)' = K(inds, inds), and
+ *      (iii) V*V(inds,:)' = K_hat(:,inds) = K(:, inds).
+ * 
+ * If we abbreviate M := V(inds, :), then the restricted KRR problem can be framed as 
+ * 
+ *      min{ || V M' x - H ||_2^2 + mu || M' x ||_2^2  :  x  }.
+ * 
+ * We approach this by a change of basis, solving problems like
+ * 
+ *      min{ ||V y - H||_2^2 + mus || y ||_2^2 : y }        (***)
+ * 
+ *  and then returning x = inv(M') y.
+ * 
+ * Note that since we spend O(n*k^2) time getting our hands on V and inds, it would be
+ * reasonable to spend O(n*k^2) additional time to solve (***) by a direct method.
+ * However, it is easy enough to reduce the cost of solving (***) to o(n*k^2)
+ * (that is, little-o of n*k^2) by a sketch and precondition approach. 
+ *
+ */
+template <typename T, typename FUNC, typename SEMINORM, typename STATE>
+STATE krill_restricted_rpchol(
+    int64_t n, FUNC &G, vector<T> &H, vector<T> &X, T tol,
+    STATE state, SEMINORM seminorm, int64_t rpchol_block_size = -1, int64_t max_iters = 20, int64_t k = -1
+) {
+    // NOTE: on entry, X is n-by-s for some integer s. That's way bigger than it needs to be, since the
+    // solution we return can be written down with k*s nonzeros plus k indices to indicate which rows of X
+    // are nonzero.
+    // Negative rpchol_block_size / k mean "use a default"; resolve them before any buffer is sized.
+    if (rpchol_block_size < 0)
+        rpchol_block_size = std::max((int64_t) 1, std::min((int64_t) 64, n/4));
+    if (k < 0)
+        k = (int64_t) std::sqrt(n);
+    vector<T> V(n*k, 0.0);
+    vector<T> eigvals(k, 0.0);
+    G.set_eval_includes_reg(false);
+
+    vector<int64_t> inds(k, -1);
+    state = rp_cholesky(n, G, k, inds.data(), V.data(), rpchol_block_size, state);
+    inds.resize(k);
+    // ^ VV' defines a rank-k Nystrom approximation of G. The approximation satisfies
+    //
+    //          VV' = G(:,inds) * inv(G(inds, inds)) * G(inds, :) 
+    //   and
+    //          (VV')(inds, inds) = G(inds, inds).
+    //
+    //   That second identity can be written as MM' = G(inds, inds) for M = V(inds, :).
+    //
+    vector<T> M(k * k);
+    _rpchol_impl::pack_selected_rows(blas::Layout::ColMajor, n, k, V.data(), inds, M.data());
+
+    linops::SpectralPrecond<T> invP(n);
+    // invP.prep(V, eigvals, mus, ell);
+    return state;
+}
+
+// template <typename T, typename FUNC, typename STATE>
+// STATE krill_block(
+//
+// ) {
+//
+// }
+
+
+} // end namespace RandLAPACK
diff --git a/RandLAPACK/drivers/rl_rbki.hh b/RandLAPACK/drivers/rl_rbki.hh
index 9c227422..85cf6480 100644
--- a/RandLAPACK/drivers/rl_rbki.hh
+++ b/RandLAPACK/drivers/rl_rbki.hh
@@ -248,7 +248,7 @@ int RBKI<T, RNG>::call(
     omp_set_num_threads(this->num_threads_some);
 #endif
     RandBLAS::DenseDist D(n, k);
-    state = RandBLAS::fill_dense(D, Y_i, state).second;
+    state = RandBLAS::fill_dense(D, Y_i, state);
 #if RandLAPACK_HAS_OpenMP
     omp_set_num_threads(this->num_threads_rest);
 #endif
diff --git a/RandLAPACK/drivers/rl_revd2.hh b/RandLAPACK/drivers/rl_revd2.hh
index e5eee65f..bf6d49e9 100644
--- a/RandLAPACK/drivers/rl_revd2.hh
+++ b/RandLAPACK/drivers/rl_revd2.hh
@@ -31,7 +31,7 @@ class REVD2alg {
         ) = 0;
 
         virtual int call(
-            SymmetricLinearOperator<T> &A,
+            linops::SymmetricLinearOperator<T> &A,
             int64_t &k,
             T tol,
             std::vector<T> &V,
@@ -98,7 +98,7 @@ class REVD2 : public REVD2alg<T, RNG> {
         ) override;
 
         int call(
-            SymmetricLinearOperator<T> &A,
+            linops::SymmetricLinearOperator<T> &A,
             int64_t &k,
             T tol,
             std::vector<T> &V,
@@ -127,7 +127,7 @@ class REVD2 : public REVD2alg<T, RNG> {
 /// All other parameters come from REVD2
 template <typename T>
 T power_error_est(
-    SymmetricLinearOperator<T> &A,
+    linops::SymmetricLinearOperator<T> &A,
     int64_t k,
     int p,
     T* vector_buf,
@@ -176,7 +176,7 @@ T power_error_est(
 
 template <typename T, typename RNG>
 int REVD2<T, RNG>::call(
-        SymmetricLinearOperator<T> &A,
+        linops::SymmetricLinearOperator<T> &A,
         int64_t &k,
         T tol,
         std::vector<T> &V,
@@ -252,8 +252,8 @@ int REVD2<T, RNG>::call(
         // Using the first column of Omega as a buffer for a random vector
         // To perform the following safely, need to make sure Omega has at least 4 columns
         Omega_dat = util::upsize(m * 4, this->Omega);
-        RandBLAS::DenseDist  g(m, 1);
-        error_est_state = RandBLAS::fill_dense(g, Omega_dat, error_est_state).second;
+        RandBLAS::DenseDist g(m, 1);
+        error_est_state = RandBLAS::fill_dense(g, Omega_dat, error_est_state);
 
         err = power_error_est(A, k, this->error_est_p, Omega_dat, V_dat, Y_dat, eigvals.data()); 
 
@@ -279,7 +279,7 @@ int REVD2<T, RNG>::call(
         std::vector<T> &eigvals,
         RandBLAS::RNGState<RNG> &state
 ) {
-    ExplicitSymLinOp<T> A_linop(m, uplo, A, m, Layout::ColMajor);
+    linops::ExplicitSymLinOp<T> A_linop(m, uplo, A, m, Layout::ColMajor);
     return this->call(A_linop, k, tol, V, eigvals, state);
 }
 
diff --git a/RandLAPACK/misc/rl_gen.hh b/RandLAPACK/misc/rl_gen.hh
index 1068ffad..bdc2f4a4 100644
--- a/RandLAPACK/misc/rl_gen.hh
+++ b/RandLAPACK/misc/rl_gen.hh
@@ -30,35 +30,31 @@ enum mat_type {
 
 /// A struct containing info about a given matrix to be generated by mat_gen().
 /// Requires only the size and type of a matrix by default, but can have other optional parameters.
+///
+/// We set defaults in the member declaration in case people try to struct-initialize this type.
+///
 template <typename T>
 struct mat_gen_info {
     int64_t rows;
     int64_t cols;
     int64_t rank;
     mat_type m_type;
-    T cond_num;
-    T scaling;
-    T exponent;
-    bool diag;
-    bool check_true_rank;
-    T theta;
-    T perturb;
+    T cond_num = 1.0;
+    T scaling = 1.0;
+    T exponent = 1.0;
+    bool diag = false;
+    bool check_true_rank = false;
+    T theta = 1.0;
+    T perturb = 1.0;
     char* filename;
     int workspace_query_mod;
+    T frac_spectrum_one = 0.1;
 
     mat_gen_info(int64_t& m, int64_t& n, mat_type t) {
         rows = m;
         cols = n;
         m_type = t;
-        /// default values
-        diag = false;
-        rank = n;
-        cond_num = 1.0;
-        scaling = 1.0;
-        exponent = 1.0;
-        theta = 1.0;
-        perturb = 1.0;
-        check_true_rank = false;
+        rank = n; // <-- default value.
     }
 };
 
@@ -80,8 +76,8 @@ void gen_singvec(
 
     RandBLAS::DenseDist DU(m, k);
     RandBLAS::DenseDist DV(n, k);
-    state = RandBLAS::fill_dense(DU, U, state).second;
-    state = RandBLAS::fill_dense(DV, V, state).second;
+    state = RandBLAS::fill_dense(DU, U, state);
+    state = RandBLAS::fill_dense(DV, V, state);
 
     blas::copy(k, S, k + 1, A, m + 1);
 
@@ -98,8 +94,8 @@ void gen_singvec(
 
 /// Generates a matrix with polynomially-decaying spectrum of the following form:
 /// s_i = a(i + b)^p, where p is the user-defined exponent constant, a and b are computed
-/// using p and the user-defined condition number parameter and the first 10 percent of the 
-/// singular values are equal to one.
+/// using p and the user-defined condition number parameter and the first 
+/// (100 * frac_spectrum_one) percent of the singular values are equal to one.
 /// User can optionally choose for the matrix to be diagonal.
 /// The output matrix has k singular values. 
 template <typename T, typename RNG>
@@ -108,6 +104,7 @@ void gen_poly_mat(
     int64_t &n,
     T* A,
     int64_t k,
+    T frac_spectrum_one,
     T cond,
     T p,
     bool diagon,
@@ -119,11 +116,12 @@ void gen_poly_mat(
     T* S = ( T * ) calloc( k * k, sizeof( T ) );
 
     // The first 10% of the singular values will be equal to one
-    int offset = (int) floor(k * 0.1);
+    int offset = (int) floor(k * frac_spectrum_one);
     T first_entry = 1.0;
     T last_entry = first_entry / cond;
-    T a = std::pow((std::pow(last_entry, -1 / p) - std::pow(first_entry, -1 / p)) / (k - offset), p);
-    T b = std::pow(a * first_entry, -1 / p) - offset;
+    T neg_invp = -((T)1.0)/p;
+    T a = std::pow((std::pow(last_entry, neg_invp) - std::pow(first_entry, neg_invp)) / (k - offset), p);
+    T b = std::pow(a * first_entry, neg_invp) - offset;
     // apply lambda function to every entry of s
     std::fill(s, s + offset, 1.0);
     for (int i = offset; i < k; ++i) {
@@ -243,16 +241,15 @@ void gen_spiked_mat(
 ) {
     int64_t num_rows_sampled = n / 2;
 
-    /// sample from [m] without replacement. Get the row indices for a tall LASO with a single column.
-    RandBLAS::SparseDist DS = {.n_rows = m, .n_cols = 1, .vec_nnz = num_rows_sampled, .major_axis = RandBLAS::MajorAxis::Long};
-    RandBLAS::SparseSkOp<T> S(DS, state);
-    state = RandBLAS::fill_sparse(S);
+    /// sample from [m] without replacement
+    int64_t* rows = new int64_t[num_rows_sampled]{};
+    state = RandBLAS::repeated_fisher_yates(num_rows_sampled, m, 1, rows, state);
     
     T* V   = ( T * ) calloc( n * n, sizeof( T ) );
     T* tau = ( T * ) calloc( n,     sizeof( T ) );
 
     RandBLAS::DenseDist DV(n, n);
-    state = RandBLAS::fill_dense(DV, V, state).second;
+    state = RandBLAS::fill_dense(DV, V, state);
 
     lapack::geqrf(n, n, V, n, tau);
     lapack::ungqr(n, n, n, V, n, tau);
@@ -269,11 +266,12 @@ void gen_spiked_mat(
 
     for (i = 0; i < n; ++ i) {
         for (j = 0; j < num_rows_sampled; ++j) {
-            A[m * i + S.rows[j]] *= spike_scale;
+            A[m * i + rows[j]] *= spike_scale;
         }
         j = 0;
     }
 
+    delete [] rows;
     free(V);
     free(tau);
 }
@@ -304,10 +302,10 @@ void gen_oleg_adversarial_mat(
     T* tau2 = ( T * ) calloc( n,     sizeof( T ) );
 
     RandBLAS::DenseDist DU(m, n);
-    state = RandBLAS::fill_dense(DU, U, state).second;
+    state = RandBLAS::fill_dense(DU, U, state);
 
     RandBLAS::DenseDist DV(n, n);
-    state = RandBLAS::fill_dense(DV, V, state).second;
+    state = RandBLAS::fill_dense(DV, V, state);
 
     for(int i = 0; i < n; ++i) {
         //U_dat[m * i + 1] *= scaling_factor_U;
@@ -470,7 +468,7 @@ void mat_gen(
     switch(info.m_type) {
         case polynomial:
                 // Generating matrix with polynomially decaying singular values
-                RandLAPACK::gen::gen_poly_mat(info.rows, info.cols, A, info.rank, info.cond_num, info.exponent, info.diag, state);
+                RandLAPACK::gen::gen_poly_mat(info.rows, info.cols, A, info.rank, info.frac_spectrum_one, info.cond_num, info.exponent, info.diag, state);
                 break;
         case exponential:
                 // Generating matrix with exponentially decaying singular values
@@ -480,7 +478,7 @@ void mat_gen(
         case gaussian: {
                 // Gaussian random matrix
                 RandBLAS::DenseDist D(info.rows, info.cols);
-                state = RandBLAS::fill_dense(D, A, state).second;
+                state = RandBLAS::fill_dense(D, A, state);
             }
             break;
         case step: {
@@ -522,4 +520,12 @@ void mat_gen(
             break;
     }
 }
+
+/// Convenience overload: allocates a zero-filled buffer of info.rows * info.cols entries, passes it to the pointer-based mat_gen, and returns it.
+template <typename T, typename RNG>
+std::vector<T> mat_gen(mat_gen_info<T> &info, RandBLAS::RNGState<RNG> &state) {
+    std::vector<T> A(info.rows * info.cols, 0.0);
+    mat_gen(info, A.data(), state);
+    return A;
+}
 }
diff --git a/RandLAPACK/misc/rl_linops.hh b/RandLAPACK/misc/rl_linops.hh
index e5a733df..767d5357 100644
--- a/RandLAPACK/misc/rl_linops.hh
+++ b/RandLAPACK/misc/rl_linops.hh
@@ -11,7 +11,9 @@
 #include <vector>
 #include <cstdint>
 
-namespace RandLAPACK {
+namespace RandLAPACK::linops {
+
+using std::vector;
 
 template <typename T>
 struct SymmetricLinearOperator {
@@ -41,31 +43,33 @@ struct SymmetricLinearOperator {
         int64_t ldc
     ) = 0;
 
+    virtual T operator()(int64_t i, int64_t j) = 0;
+ 
     virtual ~SymmetricLinearOperator() {}
 };
 
 template <typename T>
 struct ExplicitSymLinOp : public SymmetricLinearOperator<T> {
 
-    const Uplo uplo;
+    const blas::Uplo uplo;
     const T* A_buff;
     const int64_t lda;
-    const Layout buff_layout;
+    const blas::Layout buff_layout;
 
     ExplicitSymLinOp(
         int64_t m,
-        Uplo uplo,
+        blas::Uplo uplo,
         const T* A_buff,
         int64_t lda,
-        Layout buff_layout
-    ) : SymmetricLinearOperator<T>(m), uplo(uplo), A_buff(A_buff), lda(lda), buff_layout(buff_layout) {};
+        blas::Layout buff_layout
+    ) : SymmetricLinearOperator<T>(m), uplo(uplo), A_buff(A_buff), lda(lda), buff_layout(buff_layout) {}
 
     // Note: the "layout" parameter here is interpreted for (B and C).
     // If layout conflicts with this->buff_layout then we manipulate
     // parameters to blas::symm to reconcile the different layouts of
     // A vs (B, C).
     void operator()(
-        Layout layout,
+        blas::Layout layout,
         int64_t n,
         T alpha,
         T* const B,
@@ -78,15 +82,241 @@ struct ExplicitSymLinOp : public SymmetricLinearOperator<T> {
         randblas_require(ldc >= this->m);
         auto blas_call_uplo = this->uplo;
         if (layout != this->buff_layout)
-            blas_call_uplo = (this->uplo == Uplo::Upper) ? Uplo::Lower : Uplo::Upper;
+            blas_call_uplo = (this->uplo == blas::Uplo::Upper) ? blas::Uplo::Lower : blas::Uplo::Upper;
         // Reading the "blas_call_uplo" triangle of "this->A_buff" in "layout" order is the same
         // as reading the "this->uplo" triangle of "this->A_buff" in "this->buff_layout" order.
         blas::symm(
             layout, Side::Left, blas_call_uplo, this->m, n, alpha,
             this->A_buff, this->lda, B, ldb, beta, C, ldc
         );
-    };
+    }
+
+    inline T operator()(int64_t i, int64_t j) {
+        // Entry lookup is only supported for upper-triangular, column-major storage.
+        randblas_require(this->uplo == blas::Uplo::Upper && this->buff_layout == blas::Layout::ColMajor);
+        // Entries below the diagonal (i > j) are read from their mirror image A(j, i).
+        const int64_t row = (i > j) ? j : i;
+        const int64_t col = (i > j) ? i : j;
+        return A_buff[row + col*lda];
+    }
+};
+
+template <typename T>
+struct RegExplicitSymLinOp : public SymmetricLinearOperator<T> {
+    // Symmetric operator backed by an explicit matrix whose upper triangle is stored
+    // column-major in A_buff, plus an optional regularization term (a multiple of the
+    // identity per column) that is applied only while _eval_includes_reg is true.
+    const T* A_buff;
+    const int64_t lda;
+    vector<T> regs;
+    bool      _eval_includes_reg;
+
+    static const blas::Uplo uplo = blas::Uplo::Upper;
+    static const blas::Layout buff_layout = blas::Layout::ColMajor;
+    using scalar_t = T;
+
+    RegExplicitSymLinOp(
+        int64_t m, const T* A_buff, int64_t lda, vector<T> &regs
+    ) : SymmetricLinearOperator<T>(m), A_buff(A_buff), lda(lda), regs(regs) {
+        randblas_require(lda >= m);
+        _eval_includes_reg = false;
+    }
+
+    void set_eval_includes_reg(bool eir) {
+        _eval_includes_reg = eir;
+    }
+
+    // C = alpha*A*B + beta*C; when regularization is enabled, column i of C additionally
+    // receives alpha*regs[i]*B(:,i) (or alpha*regs[0]*B(:,i) if there is a single regularizer).
+    void operator()(blas::Layout layout, int64_t n, T alpha, T* const B, int64_t ldb, T beta, T* C, int64_t ldc) {
+        randblas_require(layout == this->buff_layout);
+        randblas_require(ldb >= this->m);
+        randblas_require(ldc >= this->m);
+        blas::symm(layout, blas::Side::Left, this->uplo, this->m, n, alpha, this->A_buff, this->lda, B, ldb, beta, C, ldc);
+
+        if (_eval_includes_reg) {
+            int64_t num_regs = this->regs.size();
+            if (num_regs != 1)
+                randblas_require(n == num_regs);
+            T* regsp = regs.data();
+            for (int64_t i = 0; i < n; ++i) {
+                T coeff =  alpha * regsp[std::min(i, num_regs - 1)];
+                blas::axpy(this->m, coeff, B + i*ldb, 1, C +  i*ldc, 1);
+            }
+        }
+    }
+
+    // Entry (i, j) of the operator. Only the upper triangle of A_buff is read, and the
+    // regularizer belongs to the identity term, so it is added on the diagonal only.
+    inline T operator()(int64_t i, int64_t j) {
+        T val = (i > j) ? A_buff[j + i*lda] : A_buff[i + j*lda];
+        if (_eval_includes_reg) {
+            randblas_require(regs.size() == 1);
+            if (i == j) val += regs[0];
+        }
+        return val;
+    }
+
+};
+
+template<typename T>
+struct SpectralPrecond {
+
+    public:
+    using scalar_t = T; 
+    const int64_t m;
+    int64_t k;
+    int64_t s;
+    vector<T> V;
+    T* V_ptr;
+    vector<T> D;
+    T* D_ptr;
+    vector<T> work;
+    T* work_ptr;
+    int64_t num_regs = 1;
+
+    /* Suppose we want to precondition a positive semidefinite matrix G_mu = G + mu*I.
+     *
+     * Once properly prepared, this preconditioner represents a linear operator of the form
+     *      P = V diag(D) V' + I.
+     * The columns of V approximate the top k eigenvectors of G, while the 
+     * entries of D are *functions of* the corresponding approximate eigenvalues.
+     * 
+     * The specific form of the entries of D are as follows. Suppose we start with
+     * (V, lambda) as approximations of the top k eigenpairs of G, and define the vector
+     *      D0 = (min(lambda) + mu) / (lambda + mu).
+     * From a mathematical perspective, this preconditioner represents the linear operator
+     *      P = V diag(D0) V' + (I - VV').
+     * The action of this linear operator can be computed with two calls to GEMM
+     * instead of three if we store D = D0 - 1 instead of D0 itself.
+     */
+
+    SpectralPrecond(int64_t m)
+        : m(m), k(1), s(1),
+          V(k * m), V_ptr(V.data()),
+          D(k), D_ptr(D.data()),
+          work(k * s), work_ptr(work.data()) {}
+
+    // Move constructor
+    // Call as SpectralPrecond<T> spc(std::move(other)) when we want to transfer the
+    // contents of "other" to "this". 
+    SpectralPrecond(SpectralPrecond &&other) noexcept
+        : m(other.m), k(other.k), s(other.s),
+          V(std::move(other.V)), V_ptr(V.data()),
+          D(std::move(other.D)), D_ptr(D.data()),
+          work(std::move(other.work)), work_ptr(work.data()),
+          num_regs(other.num_regs) {}
+
+    // Copy constructor
+    // Call as SpectralPrecond<T> spc(other) when we want to copy "other".
+    SpectralPrecond(const SpectralPrecond &other)
+        : m(other.m), k(other.k), s(other.s),
+          V(other.V), V_ptr(V.data()),
+          D(other.D), D_ptr(D.data()),
+          work(other.work), work_ptr(work.data()),
+          num_regs(other.num_regs) {} 
+
+    void prep(vector<T> &eigvecs, vector<T> &eigvals, vector<T> &mus, int64_t arg_s) {
+        // assume eigvals are positive numbers sorted in decreasing order.
+        num_regs = mus.size();
+        randblas_require(num_regs == 1 || num_regs == arg_s);
+        k = eigvals.size();
+        // eigvals[k-1] is read below, and V must supply k columns of length m.
+        randblas_require(k >= 1 && (int64_t) eigvecs.size() >= m * k);
+        D.resize(k * num_regs);
+
+        s = arg_s;
+        V = eigvecs;
+        V_ptr = V.data();
+        work.resize(k * s);
+        work_ptr = work.data();
+
+        D_ptr = D.data();
+        for (int64_t r = 0; r < num_regs; ++r) {
+            T  mu_r = mus[r];
+            T* D_r  = &D_ptr[r*k];
+            T  numerator = eigvals[k-1] + mu_r;
+            for (int64_t i = 0; i < k; ++i)
+                D_r[i] = (numerator / (eigvals[i] + mu_r)) - 1.0;
+        }
+    }
+
+    void evaluate(int64_t s, const T *x, T *dest) {
+        operator()(blas::Layout::ColMajor, s, (T) 1.0, x, m, (T) 0.0, dest, m);
+    }
+
+    void operator()(
+        blas::Layout layout, int64_t n, T alpha, const T* B, int64_t ldb, T beta, T* C, int64_t ldc
+    ) {
+        randblas_require(layout == blas::Layout::ColMajor);
+        randblas_require(ldb >= this->m);
+        randblas_require(ldc >= this->m);
+        if (this->num_regs != 1) {
+            randblas_require(n == num_regs);
+        } else {
+            randblas_require(this->s >= n);
+        }
+        // update C = alpha*(V diag(D) V' + I)B + beta*C
+        //      Step 1: w = V'B                    with blas::gemm
+        //      Step 2: w = D w                    with our own kernel
+        //      Step 3: C = beta * C + alpha * B   with blas::copy or blas::scal + blas::axpy
+        //      Step 4: C = alpha * V w + C        with blas::gemm
+        blas::gemm(layout, blas::Op::Trans, blas::Op::NoTrans, k, n, m, (T) 1.0, V_ptr, m, B, ldb, (T) 0.0, work_ptr, k);
+
+        // -----> start step 2
+        #define mat_D(_i, _j)    ((num_regs == 1) ? D_ptr[(_i)] : D_ptr[(_i) + k*(_j)])
+        #define mat_work(_i, _j) work_ptr[(_i) + k*(_j)]
+        for (int64_t j = 0; j < n; j++) {
+            for (int64_t i = 0; i < k; i++) {
+                mat_work(i, j) = mat_D(i, j) * mat_work(i, j);
+            }
+        }
+        #undef mat_D
+        #undef mat_work
+        // <----- end step 2
+
+        // -----> start step 3
+        // B and C may have different leading dimensions, so C is addressed with ldc.
+        #define colB(_i) &B[(_i)*ldb]
+        #define colC(_i) &C[(_i)*ldc]
+        if (beta == (T) 0.0 && alpha == (T) 1.0) {
+            for (int64_t i = 0; i < n; ++i)
+                blas::copy(m, colB(i), 1, colC(i), 1);
+        } else {
+            for (int64_t i = 0; i < n; ++i) {
+                T* Ci = colC(i);
+                blas::scal(m, beta, Ci, 1);
+                blas::axpy(m, alpha, colB(i), 1, Ci, 1);
+            }
+        }
+        #undef colB
+        #undef colC
+        // <----- end step 3
+
+        // Step 4 carries the alpha factor; C already holds beta*C + alpha*B at this point.
+        blas::gemm(layout, blas::Op::NoTrans, blas::Op::NoTrans, m, n, k, alpha, V_ptr, m, work_ptr, k, (T) 1.0, C, ldc);
+    }
 };
 
+// template <typename T>
+// struct ConjSpectralPrecond {
+//     public:
+//     using scalar_t = T;
+//     SpectralPrecond<T> spectral_precond;
+//     vector<T> ut_conjugator;
+
+//     ConjSpectralPrecond(SpectralPrecond<T> &sp, std::vector<T> &utc) 
+//         : spectral_precond(sp), ut_conjugator(utc) {}
+    
+//     ConjSpectralPrecond(SpectralPrecond<T> &&sp, std::vector<T> &&utc) 
+//         : spectral_precond(std::move(sp)), ut_conjugator(std::move(utc)) {}
+
+//     void operator()(
+//         blas::Layout layout, int64_t n, T alpha, const T* B, int64_t ldb, T beta, T* C, int64_t ldc
+//     ) {
+//         randblas_require(layout == blas::Layout::ColMajor);
+        
+//     }
+// };
 
-} // end namespace RandLAPACK
+} // end namespace RandLAPACK::linops
diff --git a/RandLAPACK/misc/rl_pdkernels.hh b/RandLAPACK/misc/rl_pdkernels.hh
new file mode 100644
index 00000000..5c90919d
--- /dev/null
+++ b/RandLAPACK/misc/rl_pdkernels.hh
@@ -0,0 +1,282 @@
+#ifndef randlapack_misc_pdkernels_h
+#define randlapack_misc_pdkernels_h
+
+#include "rl_blaspp.hh"
+#include "rl_linops.hh"
+#include <RandBLAS.hh>
+
+#include <iostream>
+#include <vector>
+#include <cstdint>
+#include <algorithm>
+#include <execution>
+#include <cmath>
+
+namespace RandLAPACK {
+
+/***
+ * X is a rows_x by cols_x matrix stored in column major format with
+ * leading dimension equal to rows_x. Each column of X is interpreted
+ * as a datapoint in "rows_x" dimensional space. mu and sigma are
+ * buffers of length rows_x.
+ *
+ * If use_input_mu_sigma is false then this function overwrites them as follows:
+ *
+ *     mu(i) = [the sample mean of X(i,1), ..., X(i, end) ].
+ *
+ *     sigma(i) = [the sample standard deviation of X(i,1), ..., X(i, end) ].
+ *
+ * If use_input_mu_sigma is true then mu and sigma are treated as inputs and
+ * are left unchanged (e.g., to apply training-set statistics to a test set).
+ *
+ * In both cases this function subtracts a copy of "mu" from each column of X
+ * and then divides each row of X by the corresponding entry of sigma.
+ * When use_input_mu_sigma is false, each row of X has mean 0.0 and sample
+ * standard deviation 1.0 on exit.
+ *
+ * No safeguard is applied for entries of sigma that equal zero.
+ */
+template <typename T>
+void standardize_dataset(
+    int64_t rows_x, int64_t cols_x, T* X, T* mu, T* sigma, bool use_input_mu_sigma = false
+) {
+    randblas_require(cols_x >= 2);
+    auto layout = blas::Layout::ColMajor;
+    // Every entry must equal one; note that "new T[n]{1.0}" would only set the first entry.
+    std::vector<T> ones_cols_x(cols_x, (T) 1.0);
+    if (! use_input_mu_sigma) {
+        // mu = (X * ones) / cols_x; each row of X is averaged over its cols_x entries.
+        // beta = 0, so the incoming contents of mu are ignored.
+        blas::gemv(layout, blas::Op::NoTrans, rows_x, cols_x, (T) 1.0 / (T) cols_x, X, rows_x, ones_cols_x.data(), 1, (T) 0.0, mu, 1);
+    }
+    // Rank-1 update X <- X - mu * ones' subtracts the mean from every column.
+    blas::ger(layout, rows_x, cols_x, (T) -1.0, mu, 1, ones_cols_x.data(), 1, X, rows_x);
+    if (! use_input_mu_sigma) {
+        // Sample standard deviation of each (now centered) row.
+        T stddev_scale = std::sqrt((T) (cols_x - 1));
+        for (int64_t i = 0; i < rows_x; ++i)
+            sigma[i] = blas::nrm2(cols_x, X + i, rows_x) / stddev_scale;
+    }
+    // Rescale each row; row i of X is accessed with stride rows_x.
+    for (int64_t i = 0; i < rows_x; ++i)
+        blas::scal(cols_x, (T) 1.0 / sigma[i], X + i, rows_x);
+    return;
+}
+
+/***
+ * Computes a block of the matrix of squared Euclidean distances between
+ * the columns of X.
+ *
+ * X is a rows_x by cols_x matrix stored in column major format with
+ * leading dimension equal to rows_x; sq_colnorms_x is a buffer of
+ * length "cols_x" whose j-th entry is ||X(:,j)||_2^2.
+ *
+ * The Euclidean distance matrix induced by X has entries
+ *
+ *      E(i,j) = ||X(:,i) - X(:,j)||_2^2
+ *             = ||X(:,i)||_2^2 + ||X(:,j)||_2^2 - 2 * X(:,i)' * X(:,j).
+ *
+ * The requested block has dimensions rows_eds by cols_eds and its
+ * upper-left corner sits at offset (ro_eds, co_eds) within E.
+ *
+ * On exit, Eds (column major, leading dimension rows_eds) holds that block.
+ */
+template <typename T>
+void euclidean_distance_submatrix(
+    int64_t rows_x, int64_t cols_x, const T* X, const T* sq_colnorms_x,
+    int64_t rows_eds, int64_t cols_eds, T* Eds, int64_t ro_eds, int64_t co_eds
+) {
+    randblas_require((0 <= co_eds) && ((co_eds + cols_eds) <= cols_x));
+    randblas_require((0 <= ro_eds) && ((ro_eds + rows_eds) <= cols_x));
+
+    // Squared norms of the datapoints that index the rows / columns of the block.
+    const T* row_sq_norms = &sq_colnorms_x[ro_eds];
+    const T* col_sq_norms = &sq_colnorms_x[co_eds];
+
+    // First pass: Eds(i,j) = ||x_i||^2 + ||x_j||^2, built one column at a time.
+    const std::vector<T> unit_vec(rows_eds, 1.0);
+    for (int64_t col = 0; col < cols_eds; ++col) {
+        T* target = &Eds[rows_eds * col];
+        blas::copy(rows_eds, row_sq_norms, 1, target, 1);
+        blas::axpy(rows_eds, col_sq_norms[col], unit_vec.data(), 1, target, 1);
+    }
+
+    // Second pass: subtract twice the inner products with a single GEMM,
+    // Eds <- Eds - 2 * X(:, rows)' * X(:, cols).
+    const T* X_for_rows = X + rows_x * ro_eds;
+    const T* X_for_cols = X + rows_x * co_eds;
+    blas::gemm(
+        blas::Layout::ColMajor, blas::Op::Trans, blas::Op::NoTrans,
+        rows_eds, cols_eds, rows_x,
+        -2.0, X_for_rows, rows_x, X_for_cols, rows_x, 1.0, Eds, rows_eds
+    );
+}
+
+// Evaluates the squared exponential kernel exp(-||x - y||_2^2 / (2*bandwidth^2))
+// for two points x and y given as buffers of length dim.
+template <typename T>
+T squared_exp_kernel(int64_t dim, const T* x, const T* y, T bandwidth) {
+    // Dividing each coordinate difference by sqrt(2)*bandwidth folds the
+    // 1/(2*bandwidth^2) factor into the accumulated squared norm.
+    const T denom = std::sqrt(2.0)*bandwidth;
+    T accum = 0.0;
+    int64_t idx = 0;
+    while (idx < dim) {
+        const T scaled_gap = (x[idx] - y[idx])/denom;
+        accum += scaled_gap*scaled_gap;
+        ++idx;
+    }
+    return std::exp(-accum);
+}
+
+/***
+ * Computes a block of a squared exponential kernel matrix.
+ *
+ * X is a rows_x by cols_x matrix stored in column major format with
+ * leading dimension equal to rows_x; sq_colnorms_x is a buffer of
+ * length "cols_x" whose j-th entry is ||X(:,j)||_2^2.
+ *
+ * Each column of X defines a datapoint, and the induced positive
+ * (semi)definite kernel matrix with scale "bandwidth" has entries
+ *
+ *      K(i, j) = exp(- ||X(:,i) - X(:,j)||_2^2 / (2*bandwidth^2)).
+ *
+ * The requested block has dimensions rows_ksub by cols_ksub and its
+ * upper-left corner sits at offset (ro_ksub, co_ksub) within K.
+ *
+ * The result is stored in "Ksub", which is interpreted in column-major
+ * order with leading dimension equal to rows_ksub.
+ */
+template <typename T>
+void squared_exp_kernel_submatrix(
+    int64_t rows_x, int64_t cols_x, const T* X, T* sq_colnorms_x,
+    int64_t rows_ksub, int64_t cols_ksub,  T* Ksub, int64_t ro_ksub, int64_t co_ksub,
+    T bandwidth
+) {
+    int64_t num_entries = rows_ksub * cols_ksub;
+    randblas_require(bandwidth > 0);
+    // Fill Ksub with squared distances, then exponentiate entrywise in place.
+    euclidean_distance_submatrix(rows_x, cols_x, X, sq_colnorms_x, rows_ksub, cols_ksub, Ksub, ro_ksub, co_ksub);
+    const T exponent_scale = -1.0 / (2.0 * bandwidth * bandwidth);
+    #pragma omp parallel for
+    for (int64_t idx = 0; idx < num_entries; ++idx)
+        Ksub[idx] = std::exp(exponent_scale*Ksub[idx]);
+}
+
+
+/**
+ * Overwrites D with the product
+ *
+ *  D = [A ][ B ] C
+ *      [B'][ 0 ]
+ *
+ * where A is k-by-k, B is k-by-ell, and C has n columns (and k + ell rows).
+ *
+ * All matrices are column-major; A and B have leading dimension k, while
+ * C and D have leading dimensions ldc and ldd. Only the first k rows of D
+ * are written when ell <= 0.
+ */
+template <typename T>
+void block_arrowhead_multiply(int64_t k, int64_t ell, int64_t n, const T* A, const T* B, const T* C, int64_t ldc, T* D, int64_t ldd ) {
+    using blas::Op;
+    const auto colmajor = blas::Layout::ColMajor;
+    const T one  = (T) 1.0;
+    const T zero = (T) 0.0;
+
+    // Top block, first term: D_top = A * C_top.
+    blas::gemm(colmajor, Op::NoTrans, Op::NoTrans, k, n, k, one, A, k, C, ldc, zero, D, ldd);
+    if (ell <= 0)
+        return;
+
+    // The trailing ell rows of C and D start k rows below the leading ones.
+    const T* C_lower = C + k;
+    T* D_lower = D + k;
+
+    // Top block, second term: D_top += B * C_bot.
+    blas::gemm(colmajor, Op::NoTrans, Op::NoTrans, k, n, ell, one, B, k, C_lower, ldc, one, D, ldd);
+    // Bottom block: D_bot = B' * C_top.
+    blas::gemm(colmajor, Op::Trans,   Op::NoTrans, ell, n, k, one, B, k, C, ldc, zero, D_lower, ldd);
+}
+
+
+namespace linops {
+
+/***
+ * Linear operator for a squared exponential (RBF) kernel matrix K induced by
+ * the columns of X, optionally with regularization added on the diagonal.
+ * Blocks of K are generated on the fly when the operator is applied, so the
+ * full m-by-m matrix is never formed.
+ *
+ * It might be practical to have one class that handles several different kinds of kernels.
+ */
+template <typename T>
+struct RBFKernelMatrix : public SymmetricLinearOperator<T> {
+    // Column-major data matrix (not owned); column i is the i-th datapoint.
+    const T* X;
+    // Number of rows of X, i.e., the dimension of each datapoint.
+    const int64_t rows_x;
+    // Bandwidth of the squared exponential kernel.
+    T bandwidth;
+    // Regularization parameters: either one value shared by all right-hand
+    // sides, or one value per column passed to the block apply.
+    vector<T> regs;
+
+    // Squared Euclidean norms of the columns of X.
+    vector<T> _sq_colnorms_x;
+    // Workspace for a (block size)-by-(up to m) slab of kernel entries.
+    vector<T> _eval_work1;
+    // Workspace for the result of one arrowhead multiply.
+    vector<T> _eval_work2;
+    // If true, the operator represents K + reg*I rather than K.
+    bool      _eval_includes_reg;
+    // Number of kernel rows generated per step of the block apply.
+    int64_t   _eval_block_size;
+
+    using scalar_t = T;
+
+    RBFKernelMatrix(
+        int64_t m, const T* X, int64_t rows_x, T bandwidth, vector<T> &regs
+    ) : SymmetricLinearOperator<T>(m), X(X), rows_x(rows_x), bandwidth(bandwidth),  regs(regs), _sq_colnorms_x(m), _eval_work1{}, _eval_work2{} {
+        for (int64_t i = 0; i < m; ++i) {
+            _sq_colnorms_x[i] = std::pow(blas::nrm2(rows_x, X + i*rows_x, 1), 2);
+        }
+        // Clamp to at least 1: m / 4 is zero when m < 4, and a zero block
+        // size would keep the block apply from ever making progress.
+        _eval_block_size = std::max((int64_t) 1, std::min(m / ((int64_t) 4), (int64_t) 512));
+        _eval_work1.resize(_eval_block_size * m);
+        _eval_includes_reg = false;
+        return;
+    }
+
+    // Fills _eval_work1 with the rows_ksub-by-cols_ksub block of K whose
+    // upper-left corner is at (ro_ksub, co_ksub).
+    void _prep_eval_work1(int64_t rows_ksub, int64_t cols_ksub, int64_t ro_ksub, int64_t co_ksub) {
+        randblas_require(rows_ksub * cols_ksub <= (int64_t) _eval_work1.size());
+        squared_exp_kernel_submatrix(
+            rows_x, this->m, X, _sq_colnorms_x.data(),
+            rows_ksub, cols_ksub, _eval_work1.data(), ro_ksub, co_ksub, bandwidth
+        );
+    }
+
+    void set_eval_includes_reg(bool eir) {
+        _eval_includes_reg = eir;
+    }
+
+    // Computes C = alpha * (K [+ reg*I]) * B + beta * C, where B and C are
+    // m-by-n column-major matrices with leading dimensions ldb and ldc.
+    void operator()(blas::Layout layout, int64_t n, T alpha, T* const B, int64_t ldb, T beta, T* C, int64_t ldc) {
+        randblas_require(layout == blas::Layout::ColMajor);
+        randblas_require(ldb >= this->m);
+        randblas_require(ldc >= this->m);
+        // _eval_block_size is a public field; a non-positive value would make the loop below spin forever.
+        randblas_require(_eval_block_size > 0);
+
+        _eval_work2.resize(this->m * n);
+        for (int64_t i = 0; i < n; ++i) {
+            blas::scal(this->m, beta, C + i*ldc, 1);
+        }
+        // Sweep down the diagonal of K. Each step handles the trailing
+        // "arrowhead" [A B; B' 0] of the not-yet-processed todo-by-todo block,
+        // where [A B] is the k-by-todo slab of K starting at (done, done).
+        int64_t done = 0;
+        int64_t todo = this->m;
+        while (todo > 0) {
+            int64_t k = std::min(_eval_block_size, todo);
+            _prep_eval_work1(k, todo, done, done);
+            const T* arrowhead_A = _eval_work1.data();
+            const T* arrowhead_B = arrowhead_A + k * k;
+            const T* arrowhead_C = B + done;
+            T* arrowhead_D = _eval_work2.data();
+            int64_t ell = (todo > k) ? (todo - k) : 0;
+            block_arrowhead_multiply(k, ell, n, arrowhead_A, arrowhead_B, arrowhead_C, ldb, arrowhead_D, todo);
+            for (int64_t i = 0; i < n; ++i) {
+                blas::axpy(todo, alpha, arrowhead_D + i*todo, 1, C + done + i*ldc, 1);
+            }
+            done += k;
+            todo -= k;
+        }
+        if (_eval_includes_reg) {
+            int64_t num_regs = this->regs.size();
+            randblas_require(num_regs == 1 || n == num_regs);
+            T* regsp = regs.data();
+            for (int64_t i = 0; i < n; ++i) {
+                // Column i uses its own regularizer, or the single shared one.
+                T coeff =  alpha * regsp[std::min(i, num_regs - 1)];
+                blas::axpy(this->m, coeff, B + i*ldb, 1, C +  i*ldc, 1);
+            }
+        }
+        return;
+    }
+
+    // Returns entry (i, j) of the operator. The regularizer is only added on
+    // the diagonal, matching the block apply above (which adds reg * B).
+    inline T operator()(int64_t i, int64_t j) {
+        T val = squared_exp_kernel(rows_x, X + i*rows_x, X + j*rows_x, bandwidth);
+        if (_eval_includes_reg) {
+            randblas_require(regs.size() == 1);
+            if (i == j)
+                val += regs[0];
+        }
+        return val;
+    }
+};
+
+} // end namespace RandLAPACK::linops
+
+}
+#endif
\ No newline at end of file
diff --git a/RandLAPACK/misc/rl_util.hh b/RandLAPACK/misc/rl_util.hh
index 15f516ae..eba68d9d 100644
--- a/RandLAPACK/misc/rl_util.hh
+++ b/RandLAPACK/misc/rl_util.hh
@@ -273,7 +273,7 @@ T estimate_spectral_norm(
     std::vector<T> buf1 (m, 0.0);
 
     RandBLAS::DenseDist DV(n, 1);
-    state = RandBLAS::fill_dense(DV, buf.data(), state).second;
+    state = RandBLAS::fill_dense(DV, buf.data(), state);
 
     T prev_norm_inv = 1.0;
     for(int i = 0; i < p; ++i) {
diff --git a/benchmark/CMakeLists.txt b/benchmark/CMakeLists.txt
index 8be3b062..dd17e750 100644
--- a/benchmark/CMakeLists.txt
+++ b/benchmark/CMakeLists.txt
@@ -1,4 +1,7 @@
-cmake_minimum_required(VERSION 3.10)
+cmake_minimum_required(VERSION 3.27)
+
+# cmake -DCMAKE_BUILD_TYPE=Release -DRandom123_DIR=`pwd`/../../../random123-install/include/  -Dblaspp_DIR=`pwd`/../../../blaspp-install/lib/cmake/blaspp/  -DRandLAPACK_DIR=`pwd`/../../../RandLAPACK-install/lib/cmake/ -Dlapackpp_DIR=`pwd`/../../../lapackpp-install/lib/cmake/lapackpp/  -DCMAKE_BINARY_DIR=`pwd`   ..
+# ^ Example CMake configuration line with "install," "build" and "randlibs" folders.
 
 # cmake -DCMAKE_BUILD_TYPE=Release -DRandom123_DIR=`pwd`/../../install/random123-install/include/  -Dblaspp_DIR=`pwd`/../../install/blaspp_GPU-install/lib64/blaspp/  -DRandLAPACK_DIR=`pwd`/../../install/RandLAPACK-install/lib64/cmake/ -Dlapackpp_DIR=`pwd`/../../install/lapackpp-install/lib64/lapackpp/  -DCMAKE_BINARY_DIR=`pwd`     ../../randlibs/RandLAPACK/benchmark/
 # ^ Example CMake configuration line with "install," "build" and "randlibs" folders.
@@ -39,7 +42,7 @@ function(add_benchmark)
     set(MVO CXX_SOURCES LINK_LIBS)
     cmake_parse_arguments(PARSE_ARGV 0 TGT "${OPTS}" "${NVPO}" "${MVO}")
     add_executable(${TGT_NAME} ${TGT_CXX_SOURCES})
-    target_compile_options(${TGT_NAME} PRIVATE "-g")
+    target_compile_options(${TGT_NAME} PUBLIC -O1)
     target_include_directories(${TGT_NAME} PUBLIC ${Benchmark_include_dirs})
     target_link_libraries(${TGT_NAME} ${TGT_LINK_LIBS})
     message(STATUS "RandLAPACK: added ${TGT_NAME} benchmark")
@@ -108,3 +111,17 @@ add_benchmark(NAME ICQRRP_subroutines_speed      CXX_SOURCES bench_CQRRP/ICQRRP_
 add_benchmark(NAME RBKI_speed_comparisons      CXX_SOURCES bench_RBKI/RBKI_speed_comparisons.cc      LINK_LIBS ${Benchmark_libs})
 add_benchmark(NAME RBKI_runtime_breakdown      CXX_SOURCES bench_RBKI/RBKI_runtime_breakdown.cc      LINK_LIBS ${Benchmark_libs})
 add_benchmark(NAME RBKI_speed_comparisons_SVDS CXX_SOURCES bench_RBKI/RBKI_speed_comparisons_SVDS.cc LINK_LIBS ${Benchmark_libs_external})
+
+
+
+# KRILL benchmarks
+# These benchmarks read their datasets from Matrix Market files, so the
+# fast_matrix_market library is fetched at configure time and linked in below.
+# NOTE(review): GIT_TAG names the "main" branch rather than a release tag or
+# commit hash, so the fetched revision can change between configures; consider
+# pinning it for reproducible builds.
+include(FetchContent)
+FetchContent_Declare(
+    fast_matrix_market
+    GIT_REPOSITORY https://github.com/alugowski/fast_matrix_market
+    GIT_TAG main
+    GIT_SHALLOW TRUE
+)
+FetchContent_MakeAvailable(fast_matrix_market)
+# Both benchmarks list the shared header kernelbench_common.hh next to their own .cc file.
+add_benchmark(NAME KRR_simple  CXX_SOURCES bench_kernelalgs/kernelbench_common.hh bench_kernelalgs/krr.cc  LINK_LIBS ${Benchmark_libs} fast_matrix_market::fast_matrix_market)
+add_benchmark(NAME KPCA_simple CXX_SOURCES bench_kernelalgs/kernelbench_common.hh bench_kernelalgs/kpca.cc LINK_LIBS ${Benchmark_libs} fast_matrix_market::fast_matrix_market)
diff --git a/benchmark/bench_kernelalgs/kernelbench_common.hh b/benchmark/bench_kernelalgs/kernelbench_common.hh
new file mode 100644
index 00000000..eeaeb1a6
--- /dev/null
+++ b/benchmark/bench_kernelalgs/kernelbench_common.hh
@@ -0,0 +1,251 @@
+#pragma once
+
+#include <blas.hh>
+#include <lapack.hh>
+#include <RandBLAS.hh>
+#include <RandLAPACK.hh>
+
+#include <chrono>
+#include <unordered_map>
+#include <iomanip> 
+#include <limits> 
+#include <numbers>
+
+#include <iostream>
+#include <fstream>
+
+#include <fast_matrix_market/fast_matrix_market.hpp>
+
+using std_clock = std::chrono::high_resolution_clock;
+using timepoint_t = std::chrono::time_point<std_clock>;
+using std::chrono::duration_cast;
+using std::chrono::microseconds;
+
+using RandBLAS::RNGState;
+using RandLAPACK::rp_cholesky;
+using lapack::gesdd;
+using lapack::Job;
+using std::vector;
+
+// Returns the time from tp0 to tp1 in seconds, measured at microsecond resolution.
+// Marked inline because it is defined in a header; without it, including this
+// header from more than one translation unit of a target would break linking.
+inline double sec_elapsed(timepoint_t tp0, timepoint_t tp1) {
+    return ((double) duration_cast<microseconds>(tp1 - tp0).count())/1e6;
+}
+
+// Writes the transpose of the m-by-n column-major matrix A (leading dimension
+// lda) into the n-by-m column-major matrix AT (leading dimension ldat).
+// Column i of A is copied into row i of AT.
+template <typename T>
+void transpose_colmajor(
+    int64_t m, int64_t n, const T* A, int64_t lda, T* AT, int64_t ldat
+) {
+    // The index type matches the int64_t dimensions (the original used int).
+    for (int64_t i = 0; i < n; ++i)
+        blas::copy(m, &A[i * lda], 1, &AT[i], ldat);
+}
+
+
+// A dense matrix of doubles: its dimensions plus a flat buffer of entries.
+// The readers in this file fill vals in column-major order.
+struct array_matrix {
+    int64_t nrows = 0, ncols = 0;
+    std::vector<double> vals;
+};
+
+// Training and test matrices for a kernel ridge regression problem.
+// The code in this file treats each column of X_train / X_test as one datapoint.
+struct KRR_data {
+    array_matrix X_train;
+    array_matrix Y_train;
+    array_matrix X_test;
+    array_matrix Y_test;
+};
+
+// Standardizes the features (rows) of the training and test inputs in place.
+// The training set is processed with use_input_mu_sigma = false, which fills
+// mu and sigma; the same buffers are then passed for the test set with
+// use_input_mu_sigma = true.
+// Marked inline because it is defined in a header.
+inline void standardize(KRR_data &krrd) {
+    randblas_require(krrd.X_train.nrows == krrd.X_test.nrows);
+    using T = double;
+    int64_t d = krrd.X_train.nrows;
+    std::vector<T> mu(d, 0.0);
+    std::vector<T> sigma(d, 0.0);
+    RandLAPACK::standardize_dataset(
+        d, krrd.X_train.ncols, krrd.X_train.vals.data(), mu.data(), sigma.data(), false
+    );
+    RandLAPACK::standardize_dataset(
+        d, krrd.X_test.ncols, krrd.X_test.vals.data(), mu.data(), sigma.data(), true
+    );
+    return;
+}
+
+// Reads a dense Matrix Market "array" file into a column-major array_matrix.
+// If transpose is true (the default), the transpose of the stored matrix is
+// returned instead, so rows of the file become columns of the result.
+// Marked inline because it is defined in a header.
+inline array_matrix mmread_file(std::string fn, bool transpose = true) {
+    array_matrix mat{};
+    std::ifstream file_stream(fn);
+    fast_matrix_market::read_matrix_market_array(
+        file_stream, mat.nrows, mat.ncols, mat.vals, fast_matrix_market::col_major
+    );
+    if (! transpose)
+        return mat;
+    array_matrix tmat{};
+    tmat.nrows = mat.ncols;
+    tmat.ncols = mat.nrows;
+    tmat.vals.resize(mat.vals.size(), 0.0);
+    transpose_colmajor(
+        mat.nrows, mat.ncols, mat.vals.data(), mat.nrows, tmat.vals.data(), tmat.nrows
+    );
+    return tmat;
+}
+
+// Loads Xtr.mm, Ytr.mm, Xts.mm and Yts.mm from directory dn and standardizes
+// the features of the training and test inputs.
+// Marked inline because it is defined in a header.
+inline KRR_data mmread_krr_data_dir(std::string dn) {
+    // mmread_file calls below always apply a transpose; might need to skip transposition for some
+    // datasets.
+    KRR_data data{};
+    data.X_train = mmread_file(dn + "/Xtr.mm");
+    data.Y_train = mmread_file(dn + "/Ytr.mm");
+    data.X_test  = mmread_file(dn + "/Xts.mm");
+    data.Y_test  = mmread_file(dn + "/Yts.mm");
+    standardize(data);
+    return data;
+}
+
+/*
+ * The memory-profiling helpers in namespace memprof are based on code by:
+ *
+ * Author:  David Robert Nadeau
+ * Site:    http://NadeauSoftware.com/
+ * License: Creative Commons Attribution 3.0 Unported License
+ *          http://creativecommons.org/licenses/by/3.0/deed.en_US
+ */
+
+// System headers are included at global scope, *before* the namespace opens.
+// Including them inside "namespace memprof { ... }" would declare their
+// contents as members of memprof whenever this is their first inclusion.
+#include <cassert>
+#include <cmath>
+
+#if defined(_WIN32)
+// windows.h must come first: psapi.h relies on types that it declares.
+#include <windows.h>
+#include <psapi.h>
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) ||                 \
+    (defined(__APPLE__) && defined(__MACH__))
+#include <sys/resource.h>
+#include <unistd.h>
+
+#if defined(__APPLE__) && defined(__MACH__)
+#include <mach/mach.h>
+
+#elif (defined(_AIX) || defined(__TOS__AIX__)) ||                              \
+    (defined(__sun__) || defined(__sun) ||                                     \
+     defined(sun) && (defined(__SVR4) || defined(__svr4__)))
+#include <fcntl.h>
+#include <procfs.h>
+
+#elif defined(__linux__) || defined(__linux) || defined(linux) ||              \
+    defined(__gnu_linux__)
+#include <stdio.h>
+
+#endif
+
+#else
+#error "Cannot define getPeakRSS( ) or getCurrentRSS( ) for an unknown OS."
+#endif
+
+namespace memprof {
+
+/**
+ * Returns the peak (maximum so far) resident set size (physical
+ * memory use) measured in bytes, or zero if the value cannot be
+ * determined on this OS.
+ *
+ * Exactly one of the preprocessor branches below is compiled, selected
+ * by the platform macros.
+ */
+inline size_t getPeakRSS() {
+#if defined(_WIN32)
+  /* Windows -------------------------------------------------- */
+  PROCESS_MEMORY_COUNTERS info;
+  GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
+  return (size_t)info.PeakWorkingSetSize;
+
+#elif (defined(_AIX) || defined(__TOS__AIX__)) ||                              \
+    (defined(__sun__) || defined(__sun) ||                                     \
+     defined(sun) && (defined(__SVR4) || defined(__svr4__)))
+  /* AIX and Solaris ------------------------------------------ */
+  /* Reads this process's psinfo record; any failure yields zero. */
+  struct psinfo psinfo;
+  int fd = -1;
+  if ((fd = open("/proc/self/psinfo", O_RDONLY)) == -1)
+    return (size_t)0L; /* Can't open? */
+  if (read(fd, &psinfo, sizeof(psinfo)) != sizeof(psinfo)) {
+    close(fd);
+    return (size_t)0L; /* Can't read? */
+  }
+  close(fd);
+  /* pr_rssize is scaled by 1024 here (presumably reported in kilobytes). */
+  return (size_t)(psinfo.pr_rssize * 1024L);
+
+#elif defined(__unix__) || defined(__unix) || defined(unix) ||                 \
+    (defined(__APPLE__) && defined(__MACH__))
+  /* BSD, Linux, and OSX -------------------------------------- */
+  struct rusage rusage;
+  getrusage(RUSAGE_SELF, &rusage);
+  /* ru_maxrss is returned unscaled on Apple platforms and multiplied by 1024
+   * elsewhere (presumably bytes vs. kilobytes -- confirm against getrusage docs). */
+#if defined(__APPLE__) && defined(__MACH__)
+  return (size_t)rusage.ru_maxrss;
+#else
+  return (size_t)(rusage.ru_maxrss * 1024L);
+#endif
+
+#else
+  /* Unknown OS ----------------------------------------------- */
+  return (size_t)0L; /* Unsupported. */
+#endif
+}
+
+/**
+ * Returns the current resident set size (physical memory use) measured
+ * in bytes, or zero if the value cannot be determined on this OS.
+ *
+ * Exactly one of the preprocessor branches below is compiled, selected
+ * by the platform macros.
+ */
+inline size_t getCurrentRSS() {
+#if defined(_WIN32)
+  /* Windows -------------------------------------------------- */
+  PROCESS_MEMORY_COUNTERS info;
+  GetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info));
+  return (size_t)info.WorkingSetSize;
+
+#elif defined(__APPLE__) && defined(__MACH__)
+  /* OSX ------------------------------------------------------ */
+  struct mach_task_basic_info info;
+  mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT;
+  if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info,
+                &infoCount) != KERN_SUCCESS)
+    return (size_t)0L; /* Can't access? */
+  return (size_t)info.resident_size;
+
+#elif defined(__linux__) || defined(__linux) || defined(linux) ||              \
+    defined(__gnu_linux__)
+  /* Linux ---------------------------------------------------- */
+  long rss = 0L;
+  FILE *fp = NULL;
+  if ((fp = fopen("/proc/self/statm", "r")) == NULL)
+    return (size_t)0L; /* Can't open? */
+  /* "%*s" skips the first field of statm; the second field is read into rss. */
+  if (fscanf(fp, "%*s%ld", &rss) != 1) {
+    fclose(fp);
+    return (size_t)0L; /* Can't read? */
+  }
+  fclose(fp);
+  /* rss is treated as a page count, so scale it by the page size. */
+  return (size_t)rss * (size_t)sysconf(_SC_PAGESIZE);
+
+#else
+  /* AIX, BSD, Solaris, and Unknown OS ------------------------ */
+  return (size_t)0L; /* Unsupported. */
+#endif
+}
+
+// inline void log_pages() {
+//     static size_t pagesize = sysconf(_SC_PAGESIZE);
+//     int64_t bytes = getCurrentRSS();
+//     assert((bytes % pagesize) == 0);
+//     size_t pages = bytes / pagesize;
+//     std::cout << "page size: " << pagesize << "\t";
+//     std::cout << "bytes: " << bytes << "\t";
+//     std::cout << "pages: " << pages << std::endl;
+//     return;
+// }
+
+// Writes the page size, the current resident set size in bytes, and the
+// corresponding number of pages to the given stream on a single line.
+inline void log_pages(std::ostream &stream) {
+    static size_t page_bytes = sysconf(_SC_PAGESIZE);
+    int64_t rss_bytes = getCurrentRSS();
+    // The resident set size is expected to be a whole number of pages.
+    assert((rss_bytes % page_bytes) == 0);
+    size_t num_pages = rss_bytes / page_bytes;
+    stream << "page size: " << page_bytes << "\t"
+           << "bytes: " << rss_bytes << "\t"
+           << "pages: " << num_pages << std::endl;
+}
+
+// Writes the current resident set size, converted to units of 1024^3 bytes,
+// to the given stream.
+inline void log_memory_gb(std::ostream &stream) {
+    const int64_t rss_bytes = getCurrentRSS();
+    const double bytes_per_gb = (double) std::pow(1024,3);
+    stream << " Memory (GB)  : " << ((double) rss_bytes) / bytes_per_gb << "\n";
+}
+
+}
\ No newline at end of file
diff --git a/benchmark/bench_kernelalgs/kpca.cc b/benchmark/bench_kernelalgs/kpca.cc
new file mode 100644
index 00000000..547db1f3
--- /dev/null
+++ b/benchmark/bench_kernelalgs/kpca.cc
@@ -0,0 +1,140 @@
+
+#include "kernelbench_common.hh"
+#include <RandLAPACK.hh>
+#include <RandBLAS.hh>
+#include <lapack.hh>
+#include <blas.hh>
+
+
+#ifndef DOUT
+#define DOUT(_d) std::setprecision(8) << _d
+#endif
+
+using RandLAPACK::rp_cholesky;
+using blas::Layout;
+using lapack::gesdd;
+using lapack::Job;
+using std::vector;
+
+
+
+// Overwrites the m-by-n column-major matrix A (leading dimension lda) with its
+// left singular vectors, computed from an eigendecomposition of the Gram
+// matrix A'A.
+//
+//   singvals_squared : buffer of length n; on exit holds the eigenvalues of
+//                      A'A (the squared singular values of A), largest first.
+//   work             : buffer of length at least n*n + m*n. On exit its first
+//                      n*n entries hold the right singular vectors of A.
+//
+// Returns 0 on success, or the nonzero status reported by lapack::syevd, in
+// which case A is left unmodified.
+template <typename T>
+int cholsvd_square(int64_t m, int64_t n, T* A, int64_t lda, T* singvals_squared, T* work) {
+    auto layout = Layout::ColMajor;
+    auto uplo = blas::Uplo::Lower;
+    // work[0 : n*n] = A' * A (lower triangle only).
+    blas::syrk(layout, uplo, blas::Op::Trans, n, m, (T)1.0, A, lda, (T)0.0, work, n);
+    int64_t info = lapack::syevd(Job::Vec, uplo, n, work, n, singvals_squared);
+    if (info != 0)
+        return (int) info;
+    // The first n*n entries in work hold the right singular vectors of A.
+    // But they're sorted in the wrong order (ascending eigenvalues), so
+    // reverse the columns and the eigenvalues together.
+    for (int64_t j = 0; j < n/2; ++j) {
+        auto lead_off  = j;
+        auto trail_off = n-j-1;
+        T* colj       = work +   lead_off * n;
+        T* coljtrail  = work +  trail_off * n;
+        for (int64_t i = 0; i < n; ++i) {
+            std::swap(colj[i], coljtrail[i]);
+        }
+        std::swap(singvals_squared[lead_off], singvals_squared[trail_off]);
+    }
+    T* trailing_work = work + n*n;
+    // Copy A (leading dimension lda) into a packed m-by-n buffer.
+    lapack::lacpy(lapack::MatrixType::General, m, n, A, lda, trailing_work, m);
+    // trailing_work is a copy of A; A = (copy of A) * (right singular vectors).
+    blas::gemm(layout, blas::Op::NoTrans, blas::Op::NoTrans, m, n, n, (T)1.0, trailing_work, m, work, n, (T)0.0, A, lda);
+    // invert the scale on each column of A.
+    for (int64_t i = 0; i < n; ++i)
+        blas::scal(m, (T) std::pow(singvals_squared[i], -0.5), A + i*lda, 1);
+    return 0;
+}
+
+// Selects the algorithm that convert_svd uses for the SVD of a tall matrix.
+// NOTE(review): convert_svd has branches for GESDD and CholSVD only;
+// RandPrecondCholSVD is declared here but not handled there.
+enum TSSVD : char {
+    GESDD    = 'G',
+    CholSVD  = 'C',
+    RandPrecondCholSVD = 'R'
+};
+
+// Replaces the m-by-rank matrix stored in U with its left singular vectors and
+// writes its squared singular values to kevals, using the method chosen by cs.
+// The callback is invoked as cb(0) once the chosen method has finished; for any
+// other value of cs nothing is computed and cb is not called.
+// Returns the time points taken immediately before and after the work.
+template <typename T, typename CALLBACK>
+std::pair<timepoint_t,timepoint_t> convert_svd(int64_t m, int64_t rank, vector<T> &U, vector<T> &kevals, TSSVD cs, CALLBACK &cb) {
+    auto start = std_clock::now();
+    switch (cs) {
+        case TSSVD::GESDD: {
+            vector<T> vt_buffer(rank*rank, 0.0);
+            gesdd(Job::OverwriteVec, m, rank, U.data(), m, kevals.data(), nullptr, 1, vt_buffer.data(), rank);
+            // gesdd produces singular values; square them in place.
+            for (int64_t idx = 0; idx < rank; ++idx)
+                kevals[idx] = std::pow(kevals[idx], 2);
+            cb(0);
+            break;
+        }
+        case TSSVD::CholSVD: {
+            vector<T> scratch((rank + m)*rank, 0.0);
+            cholsvd_square(m, rank, U.data(), m, kevals.data(), scratch.data());
+            cb(0);
+            break;
+        }
+        default:
+            break;
+    }
+    auto stop = std_clock::now();
+    return {start, stop};
+}
+
+
+// KPCA benchmark: builds a low-rank approximation of an RBF kernel matrix with
+// RPCholesky, converts it to an approximate eigendecomposition, and reports
+// residuals for the leading components.
+//
+// Usage: KPCA_simple [dataset_dir]
+//   dataset_dir must contain Xtr.mm, Ytr.mm, Xts.mm and Yts.mm. If it is
+//   omitted, the original developer's default path is used.
+int main(int argc, char** argv) {
+    std::string dn{"/Users/rjmurr/Documents/open-data/kernel-ridge-regression/cod-rna"};
+    if (argc > 1)
+        dn = argv[1];
+    auto krrd = mmread_krr_data_dir(dn);
+    using T = double;
+    int64_t m = krrd.X_train.ncols;
+    int64_t d = krrd.X_train.nrows;
+    std::cout << "\nDataset\n " << dn << std::endl;
+    std::cout << " cols : " << m << std::endl; 
+    std::cout << " rows : " << d << "\n\n";
+    vector<T> mus{0.0};
+    RandLAPACK::linops::RBFKernelMatrix K_reg(m, krrd.X_train.vals.data(), d, 3.0, mus);
+    K_reg.set_eval_includes_reg(false);
+
+    // Variables for RPCholesky
+    int64_t rpchol_block_size = 64;
+    int64_t rank = (int64_t) std::sqrt(m);
+    vector<T> U(m * rank, 0.0);
+    RNGState state(0);
+    vector<int64_t> selection(rank, -1);
+
+    // Memory usage is logged into a string stream and printed after each phase.
+    std::stringstream strm{};
+    auto callback = [&strm](int64_t i) { memprof::log_memory_gb(strm); return i;};
+
+    std::cout << "RPCholesky (RPC)\n";
+    std::cout << " block size   : " << rpchol_block_size << std::endl;
+    std::cout << " rank limit   : " << rank << std::endl;
+    auto _tp0 = std_clock::now();
+    state = rp_cholesky(m, K_reg, rank, selection.data(), U.data(), rpchol_block_size, state, callback);
+    auto _tp1 = std_clock::now();
+    std::cout << " exit rank    : " << rank << std::endl;
+    std::cout << " RPC time (s) : " << DOUT(sec_elapsed(_tp0, _tp1)) << std::endl;
+    std::cout << strm.str();
+
+    strm.str("");
+    strm.clear();
+
+    // Variables for SVD conversion
+    //      We don't allocate these earlier, since "rank" might have decreased
+    //      in the call to rp_cholesky.
+    vector<T> kevals(rank, 0.0);
+    {
+        auto [tp0, tp1] = convert_svd(m, rank, U, kevals, TSSVD::CholSVD, callback);
+        std::cout << " SVD time (s) : " << DOUT(sec_elapsed(tp0, tp1)) << "\n";
+        std::cout << strm.str() << "\n";
+    }
+    // Now check || K_reg @ U[:, 0:num_pc] - U[:,0:num_pc] @ diag(eivals[0:num_pc]) ||,
+    //        or || K_reg @ U[:, 0:num_pc] @ inv(diag(eigvals[0:num_pc])) - U[:,0:num_pc]||
+    // Never check more components than were computed: U only has "rank"
+    // columns and kevals only has "rank" entries.
+    int64_t num_pc = (rank < 5) ? rank : 5;
+    vector<T> V(m*num_pc, 0.0);
+    T onef = 1.0;
+    K_reg(blas::Layout::ColMajor, num_pc, onef, U.data(), m, (T)0.0, V.data(), m);
+    for (int64_t i = 0; i < num_pc; ++i)
+        blas::scal(m, onef/kevals[i], V.data() + i*m, 1);
+    // ^ Now, V = K_reg @ U[:, 0:num_pc] @ inv(diag(eigvals[0:num_pc]))
+    vector<T> W(V);
+    // subtract off U
+    for (int64_t i = 0; i < m*num_pc; ++i)
+        W[i] -= U[i];
+    // compute column norms of W.
+    std::cout << "Error in KPCA components " << std::endl;
+    for (int64_t i = 0; i < num_pc; ++i) {
+        std::cout << " component " << i << " : " << DOUT(blas::nrm2(m, W.data()+i*m, 1)) << std::endl;
+    }
+    std::cout << std::endl;
+    return 0;
+}
diff --git a/benchmark/bench_kernelalgs/krr.cc b/benchmark/bench_kernelalgs/krr.cc
new file mode 100644
index 00000000..f0e85cfe
--- /dev/null
+++ b/benchmark/bench_kernelalgs/krr.cc
@@ -0,0 +1,74 @@
+
+#include "kernelbench_common.hh"
+
+#ifndef DOUT
+#define DOUT(_d) std::setprecision(std::numeric_limits<double>::max_digits10) << _d
+#endif
+
+#ifndef TIMED_LINE
+#define TIMED_LINE(_op, _name) { \
+        auto _tp0 = std_clock::now(); \
+        _op; \
+        auto _tp1 = std_clock::now(); \
+        auto dtime = sec_elapsed(_tp0, _tp1); \
+        std::cout << _name << DOUT(dtime) << std::endl; \
+        }
+#endif
+
+// KRR benchmark: solves regularized kernel systems A_linop X == H with
+// krill_full_rpchol for s = 1, 2, 4, 8 right-hand sides and prints timings.
+//
+// Usage: KRR_simple [dataset_dir]
+//   dataset_dir must contain Xtr.mm, Ytr.mm, Xts.mm and Yts.mm. If it is
+//   omitted, the original developer's default path is used.
+int main(int argc, char** argv) {
+    std::string dn{"/Users/rjmurr/Documents/open-data/kernel-ridge-regression/cod-rna"};
+    if (argc > 1)
+        dn = argv[1];
+    auto krrd = mmread_krr_data_dir(dn);
+    using T = double;
+    int64_t m = krrd.X_train.ncols;
+    int64_t d = krrd.X_train.nrows;
+    std::cout << "cols  : " << m << std::endl; 
+    std::cout << "rows  : " << d << std::endl;
+    T mu_min = m * 1e-7;
+    vector<T> mus{mu_min};
+    RandLAPACK::linops::RBFKernelMatrix A_linop(m, krrd.X_train.vals.data(), d, 3.0, mus);
+    for (int64_t s = 1; s <= 8; s*=2) {
+        // H holds s right-hand sides: the training targets in column 0,
+        // followed by s - 1 Gaussian columns rescaled to the same norm.
+        vector<T> H(m*s, 0.0);
+
+        T* Hd = H.data();
+        T* hd = krrd.Y_train.vals.data();
+        blas::copy(m, hd, 1, Hd, 1);
+        if (s > 1) {
+            RNGState state_H(1);
+            RandBLAS::DenseDist D(m, s - 1, RandBLAS::ScalarDist::Gaussian);
+            RandBLAS::fill_dense(D, Hd + m, state_H);
+            T nrm_h = blas::nrm2(m, hd, 1);
+            for (int64_t i = 1; i < s; ++i) {
+                T nrm_Hi = blas::nrm2(m, Hd + i*m, 1);
+                T scale = nrm_h / nrm_Hi;
+                blas::scal(m, scale, Hd + i*m, 1);
+            }
+        }
+
+        vector<T> X(m*s, 0.0);
+        // solve A_linop X == H
+        RNGState state(0);
+        auto seminorm = [](int64_t n, int64_t s, const T* NR){return blas::nrm2(n, NR, 1);};
+        int64_t k = 2*1024;
+        int64_t rpc_b = 64;
+        int64_t eval_block_size = 1024;
+        std::cout << "k     : " << k << std::endl;
+        std::cout << "s     : " << s << std::endl;
+        std::cout << "mu0   : " << mu_min << std::endl;
+        std::cout << "rpc_b : " << rpc_b << std::endl << std::endl;
+        T tol = std::pow(std::numeric_limits<T>::epsilon(), 0.75);
+        int64_t max_iters = 25;
+        // Override the operator's default block size; its workspace must be
+        // resized to match.
+        A_linop._eval_block_size = eval_block_size;
+        A_linop._eval_work1.resize(A_linop._eval_block_size * m);
+        TIMED_LINE(
+        RandLAPACK::krill_full_rpchol(
+            m, A_linop, H, X, tol, state, seminorm, rpc_b, max_iters, k
+        );, "\nKrill : ")
+        std::cout << std::endl;
+    }
+    return 0;
+}
diff --git a/benchmark/bench_kernelalgs/logging.txt b/benchmark/bench_kernelalgs/logging.txt
new file mode 100644
index 00000000..3e95f9ae
--- /dev/null
+++ b/benchmark/bench_kernelalgs/logging.txt
@@ -0,0 +1,173 @@
+
+KPCA
+====
+Our implementation
+    Dataset
+     /Users/rjmurr/Documents/open-data/kernel-ridge-regression/cod-rna
+     cols : 59535
+     rows : 8
+
+    RPCholesky (RPC)
+     block size   : 64
+     rank limit   : 243
+     exit rank    : 243
+     RPC time (s) : 0.058426
+     SVD time (s) : 0.056271
+
+    Error in KPCA components 
+     component 0 : 3.165351e-10
+     component 1 : 2.4040413e-08
+
+Python implementations
+
+    Dataset dimensions (RandLAPACK's convention)
+    n_rows : 8
+    n_cols : 59535
+    44.33819890022278 seconds for sklearn's KPCA.
+
+
+    Dataset dimensions (RandLAPACK's convention)
+    n_rows : 8
+    n_cols : 59535
+    0.3426549434661865 seconds for Ethan's RPCholesky, with block size 64.
+
+
+
+KRR
+===
+CONCLUSIONS
+
+1. Performance is best with -O1 (13 seconds vs 17 seconds from -O0, but same result as -O0).
+
+2. The nature of the preconditioner varies (specifically, where we encounter a Cholesky failure,
+   and hence the value of the preconditioner's rank) depending on if we use -O2 or -O3, 
+   *provided* we *don't* have the "-fsanitize=undefined" flag.
+
+NEXT STEPS
+
+* Easy: run the larger guy and see how much faster we are with -O1.
+
+* Medium: export the rp_cholesky preconditioner from MATLAB into a matrixmarket file.
+  Make a script to run lockorblock_pcg using the SpectralPrecond induced by that matrix.
+
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+////////////////////////////////////////////////////////////
+
+
+////// Debug (evidently the same as BLANK) //////////
+
+(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple 
+cols  : 59535
+rows  : 8
+k     : 1024
+s     : 1
+mu0   : 0.00059535
+rpc_b : 64
+
+Cholesky failed with exit code 40.
+Returning early, with approximation rank = 956
+
+normNR : 113.48 normR : 243.998 k: 0    dim : 0
+normNR : 0.000378058    normR : 0.000443416     k: 1    dim : 1
+normNR : 3.85744e-09    normR : 4.38204e-09     k: 2    dim : 2
+normNR : 3.71858e-14    normR : 4.1411e-14      k: 3    dim : 3
+
+Krill : 16.454101000000001
+
+
+////// Release //////////
+
+(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple 
+cols  : 59535
+rows  : 8
+k     : 1024
+s     : 1
+mu0   : 0.00059535
+rpc_b : 64
+
+Cholesky failed with exit code 44.
+Returning early, with approximation rank = 319
+
+normNR : 113.51 normR : 243.998 k: 0    dim : 0
+normNR : 0.208441       normR : 0.281245        k: 1    dim : 1
+normNR : 0.00179702     normR : 0.00254381      k: 2    dim : 2
+normNR : 1.22709e-05    normR : 1.7396e-05      k: 3    dim : 3
+normNR : 1.25328e-07    normR : 1.79491e-07     k: 4    dim : 4
+normNR : 1.00295e-09    normR : 1.49707e-09     k: 5    dim : 5
+normNR : 1.06377e-11    normR : 1.4495e-11      k: 6    dim : 6
+
+Krill : 16.997389999999999
+
+
+////// RelWithDebInfo ///// DEFAULT, -O2 //////////
+
+(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple 
+cols  : 59535
+rows  : 8
+k     : 1024
+s     : 1
+mu0   : 0.00059535
+rpc_b : 64
+
+Cholesky failed with exit code 44.
+Returning early, with approximation rank = 319
+
+normNR : 113.51 normR : 243.998 k: 0    dim : 0
+normNR : 0.208441       normR : 0.281245        k: 1    dim : 1
+normNR : 0.00179702     normR : 0.00254381      k: 2    dim : 2
+normNR : 1.22709e-05    normR : 1.7396e-05      k: 3    dim : 3
+normNR : 1.25328e-07    normR : 1.79491e-07     k: 4    dim : 4
+normNR : 1.00295e-09    normR : 1.49707e-09     k: 5    dim : 5
+normNR : 1.06377e-11    normR : 1.4495e-11      k: 6    dim : 6
+
+Krill : 17.326007000000001
+
+////// MinRelRelWithDebInfo //// changed to have -O1 ///////////
+
+(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple 
+cols  : 59535
+rows  : 8
+k     : 1024
+s     : 1
+mu0   : 0.00059535
+rpc_b : 64
+
+Cholesky failed with exit code 40.
+Returning early, with approximation rank = 956
+
+normNR : 113.48 normR : 243.998 k: 0    dim : 0
+normNR : 0.000378058    normR : 0.000443417     k: 1    dim : 1
+normNR : 3.85744e-09    normR : 4.38216e-09     k: 2    dim : 2
+normNR : 3.71859e-14    normR : 4.14125e-14     k: 3    dim : 3
+
+Krill : 13.233637999999999
+
+
+
+
+////// Release //////////
+
+(rb311) (base) s1104997ca:build rjmurr$ ./KRILL_simple 
+cols  : 59535
+rows  : 8
+k     : 1024
+s     : 1
+mu0   : 0.00059535
+rpc_b : 64
+
+Cholesky failed with exit code 40.
+Returning early, with approximation rank = 956
+
+normNR : 113.48 normR : 243.998 k: 0    dim : 0
+normNR : 0.000378058    normR : 0.000443417     k: 1    dim : 1
+normNR : 3.85744e-09    normR : 4.38216e-09     k: 2    dim : 2
+normNR : 3.71859e-14    normR : 4.14125e-14     k: 3    dim : 3
+
+Krill : 37.660519000000001
+
+
+
+
+I've observed something unexpected. If I apply the compiler flags "-Wall -Wextra -pedantic -fsanitize=undefined" then the behavior of my program with -O3 matches the behavior of my program with -O1. If the program's behavior differs when I use these flags then I would expect the compiler to tell me 
\ No newline at end of file
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 61169d12..31ac0c5c 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -6,19 +6,24 @@ if (GTest_FOUND)
     set(tmp TRUE)
 
     set(RandLAPACK_test_srcs
+        moremats.hh
         comps/test_determiter.cc
-        comps/test_util.cc
         comps/test_orth.cc
         comps/test_qb.cc
         comps/test_preconditioners.cc
         comps/test_rf.cc
         comps/test_syrf.cc
+        comps/test_rpchol.cc
         drivers/test_rsvd.cc
         drivers/test_cqrrpt.cc
         drivers/test_cqrrp.cc
         drivers/test_revd2.cc
         drivers/test_hqrrp.cc
         drivers/test_rbki.cc
+        drivers/test_krillx.cc
+        misc/test_util.cc
+        misc/test_pdkernels.cc
+        misc/test_linops.cc
     )
     
     # Create non-CUDA test executable
diff --git a/test/comps/test_determiter.cc b/test/comps/test_determiter.cc
index 40d9fb74..89257621 100644
--- a/test/comps/test_determiter.cc
+++ b/test/comps/test_determiter.cc
@@ -1,13 +1,21 @@
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
+#include "../RandLAPACK/RandBLAS/test/comparison.hh"
 
 #include <RandBLAS.hh>
 #include <math.h>
 #include <gtest/gtest.h>
 
+template <typename T>
+std::vector<T> eye(int64_t n) {
+    // Dense n-by-n identity: zero-fill, then set the diagonal (entry d*(n+1) is position (d, d)).
+    std::vector<T> identity(n * n, 0.0);
+    for (int64_t d = 0; d < n; ++d) identity[d * (n + 1)] = 1.0;
+    return identity;
+}
+
 
-class TestDetermiterOLS : public ::testing::Test
-{
+class TestDetermiterOLS : public ::testing::Test {
     protected:
         int64_t m = 201;
         int64_t n = 12;
@@ -17,13 +25,14 @@ class TestDetermiterOLS : public ::testing::Test
 
     virtual void TearDown() {};
 
-    virtual void run(uint64_t key_index)
-    {   
+    virtual void run(uint64_t key_index) {   
+       
         std::vector<double> A(m * n);
-        RandBLAS::util::genmat(m, n, A.data(), keys[key_index]);
+        RandBLAS::RNGState state0(keys[key_index]);
+        auto state1 = RandBLAS::fill_dense({m, n}, A.data(), state0);
         
         std::vector<double> b(m);
-        RandBLAS::util::genmat(m, 1, b.data(), keys[key_index] + (uint64_t) 1);
+        RandBLAS::fill_dense({m, 1}, b.data(), state1);
         
         std::vector<double> c(n, 0.0);
         std::vector<double> x0(n, 0.0);
@@ -39,9 +48,10 @@ class TestDetermiterOLS : public ::testing::Test
         double delta = 0.1;
 	    double tol = 1e-8;
 
-        RandLAPACK::pcg(
+        RandLAPACK::pcg_saddle(
             m, n, A.data(), m, b.data(), c.data(), delta,
-            resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data());
+            resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data()
+        );
         
 
         int64_t iter_count = 0;
@@ -64,3 +74,129 @@ TEST_F(TestDetermiterOLS, Trivial) {
         run(k_idx);
     }
 }
+
+
+class TestDetermiterLockBlockPCG : public ::testing::Test {
+    protected:
+    
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+    // Solve with s right-hand sides and a single regularization coefficient `coeff`; the solution is known by construction.
+    template <typename T>
+    void run_simple_block(int64_t m, int64_t s, T coeff, uint32_t seed) {   
+        using std::vector;
+        auto layout = blas::Layout::ColMajor;
+        vector<T> G_buff(m*m, 0.0);
+        vector<T> H(m*s, 0.0);
+        randblas_require((int64_t) H.size() == m*s);
+        vector<T> X_star(m*s, 0.0);
+        vector<T> X_init(m*s, 0.0);
+        RandBLAS::RNGState state0(seed);
+        vector<T> temp(2*m*m);
+        auto D = RandBLAS::DenseDist {2*m, m, RandBLAS::ScalarDist::Gaussian};
+        auto state1 = RandBLAS::fill_dense(D, temp.data(), state0);
+        blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, m, 2*m, 1.0, temp.data(), 2*m, 0.0, G_buff.data(), m);
+        // G_buff now holds the upper triangle of temp' * temp. Wrap it with the regularizer, then apply it to a random X_star to get H.
+        vector<T> regs(1, coeff);
+        RandLAPACK::linops::RegExplicitSymLinOp G(m, G_buff.data(), m, regs);
+        RandBLAS::DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian};
+        auto Xsd = X_star.data();
+        auto state2 = RandBLAS::fill_dense(DX_star, Xsd, state1);
+        G(layout, s, 1.0, X_star.data(), m, 0.0, H.data(), m);
+
+        RandLAPACK::StatefulFrobeniusNorm<T> seminorm{};
+        // Explicit identity with zero regularization; presumably the (trivial) preconditioner for lockorblock_pcg -- TODO confirm.
+        auto I_buff = eye<T>(m);
+        vector<T> zeros(1, 0.0);
+        RandLAPACK::linops::RegExplicitSymLinOp I(m, I_buff.data(), m, zeros);
+
+        T tol = 100*std::numeric_limits<T>::epsilon();
+        RandLAPACK::lockorblock_pcg(G, H, tol, m, I, seminorm, X_init, true);
+        // X_init (zero before the solve) is compared against X_star; atol = sqrt(m) * sqrt(eps) and rtol = sqrt(m) * atol.
+        T tol_scale = std::sqrt((T)m);
+        T atol = tol_scale * std::pow(std::numeric_limits<T>::epsilon(), 0.5);
+        T rtol = tol_scale * atol;
+        test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s,
+            __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol
+        );
+        return;
+    }
+    // Same construction in double precision, but with one regularization coefficient per right-hand side (s <= 4).
+    virtual void run_simple_lockstep(int64_t m, int64_t s, uint32_t seed) {  
+        using T = double;
+        randblas_require(s <= 4);
+        using std::vector;
+        vector<T> reg_coeffs{};
+        reg_coeffs.push_back(100);
+        if (s > 1)
+            reg_coeffs.push_back(7);
+        if (s > 2)
+            reg_coeffs.push_back(0.1);
+        if (s > 3)
+            reg_coeffs.push_back(0.5483);
+        auto layout = blas::Layout::ColMajor;
+        vector<T> G_buff(m*m, 0.0);
+        vector<T> H(m*s, 0.0);
+        vector<T> X_star(m*s, 0.0);
+        vector<T> X_init(m*s, 0.0);
+        RandBLAS::RNGState state0(seed);
+        vector<T> temp(2*m*m);
+        
+        auto D = RandBLAS::DenseDist {2*m, m, RandBLAS::ScalarDist::Gaussian};
+        auto state1 = RandBLAS::fill_dense(D, temp.data(), state0);
+        blas::syrk(layout, blas::Uplo::Upper, blas::Op::Trans, m, 2*m, 1.0, temp.data(), 2*m, 0.0, G_buff.data(), m);
+        // G_buff now holds the upper triangle of temp' * temp. The operator carries all s coefficients; H comes from applying it to X_star.
+        vector<T> regs(reg_coeffs);
+        RandLAPACK::linops::RegExplicitSymLinOp G(m, G_buff.data(), m, regs);
+        RandBLAS::DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian};
+        auto Xsd = X_star.data();
+        auto state2 = RandBLAS::fill_dense(DX_star, Xsd, state1);
+        G(layout, s, 1.0, X_star.data(), m, 0.0, H.data(), m);
+
+        RandLAPACK::StatefulFrobeniusNorm<T> seminorm{};
+        // Explicit identity with s zero regularizers (one per column); presumably the (trivial) preconditioner -- TODO confirm.
+        auto I_buff = eye<T>(m);
+        vector<T> zeros(s, 0.0);
+        RandLAPACK::linops::RegExplicitSymLinOp I(m, I_buff.data(), m, zeros);
+
+        T tol = 100*std::numeric_limits<T>::epsilon();
+        RandLAPACK::lockorblock_pcg(G, H, tol, m, I, seminorm, X_init, true);
+        // X_init (zero before the solve) is compared against X_star; atol = sqrt(m) * sqrt(eps) and rtol = sqrt(m) * atol.
+        T tol_scale = std::sqrt((T)m);
+        T atol = tol_scale * std::pow(std::numeric_limits<T>::epsilon(), 0.5);
+        T rtol = tol_scale * atol;
+        test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s,
+            __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol
+        );
+        return;
+    }
+};
+
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_5_1) {
+    run_simple_block<double>(5, 1, 0.5, 1997);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_6_2) {
+    run_simple_block<double>(6, 2, 0.5, 1997);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_block_5_4) {
+    run_simple_block<double>(5, 4, 0.5, 1997);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_5_1) {
+    run_simple_lockstep(5, 1, 1997);
+    run_simple_lockstep(5, 1, 2024);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_6_2) {
+    run_simple_lockstep(6, 2, 1997);
+    run_simple_lockstep(6, 2, 2024);
+}
+
+TEST_F(TestDetermiterLockBlockPCG, test_run_simple_lockstep_5_4) {
+    run_simple_lockstep(5, 4, 1997);
+    run_simple_lockstep(5, 4, 2024);
+}
diff --git a/test/comps/test_orth.cc b/test/comps/test_orth.cc
index a31e9690..1034f6f2 100644
--- a/test/comps/test_orth.cc
+++ b/test/comps/test_orth.cc
@@ -59,7 +59,7 @@ class TestOrth : public ::testing::Test
 
         // Fill the gaussian random matrix
         RandBLAS::DenseDist D(n, k);
-        state = RandBLAS::fill_dense(D, all_data.Omega.data(), state).second;
+        state = RandBLAS::fill_dense(D, all_data.Omega.data(), state);
         
         // Generate a reference identity
         RandLAPACK::util::eye(k, k, all_data.I_ref);
diff --git a/test/comps/test_pcgls.cc b/test/comps/test_pcgls.cc
index 9181d932..ff1e46c7 100644
--- a/test/comps/test_pcgls.cc
+++ b/test/comps/test_pcgls.cc
@@ -29,7 +29,7 @@ void run_pcgls_ex(int n, int m)
     double delta = 0.1;
     double tol = 1e-8;
 
-    RandLAPACK::pcg(m, n, A.data(), m, b.data(), c.data(), delta,
+    RandLAPACK::pcg_saddle(m, n, A.data(), m, b.data(), c.data(), delta,
         resid_vec, tol, n, M.data(), n, x0.data(), x.data(), y.data());
 
     for (double res: resid_vec)
diff --git a/test/comps/test_preconditioners.cc b/test/comps/test_preconditioners.cc
index 3b7c3fc6..3b34e6f5 100644
--- a/test/comps/test_preconditioners.cc
+++ b/test/comps/test_preconditioners.cc
@@ -5,31 +5,43 @@
 #include <math.h>
 #include <lapack.hh>
 
+#include "../moremats.hh"
+#include "../RandLAPACK/RandBLAS/test/comparison.hh"
+
+
+using std::vector;
+using blas::Layout;
+using blas::Op;
+using RandBLAS::DenseDist;
+using RandBLAS::SparseDist;
+using RandBLAS::RNGState;
+using RandLAPACK_Testing::polynomial_decay_psd;
+
 
 template <typename T>
 void check_condnum_after_precond(
-    blas::Layout layout,
-    std::vector<T> &A,
-    std::vector<T> &M_wk,
+    Layout layout,
+    vector<T> &A,
+    vector<T> &M_wk,
     int64_t rank,
     int64_t m,
     int64_t n
 ) {
-    std::vector<T> A_pc(m * rank, 0.0);
-    bool is_colmajor = layout == blas::Layout::ColMajor;
+    vector<T> A_pc(m * rank, 0.0);
+    bool is_colmajor = layout == Layout::ColMajor;
     int64_t lda = (is_colmajor) ? m : n;
     int64_t ldm = (is_colmajor) ? n : rank;
     int64_t ldapc = (is_colmajor) ? m : rank;
     blas::gemm(
         layout,
-        blas::Op::NoTrans,
-        blas::Op::NoTrans,
+        Op::NoTrans,
+        Op::NoTrans,
         m, rank, n,
         1.0, A.data(), lda, M_wk.data(), ldm,
         0.0, A_pc.data(), ldapc
     );
     
-    std::vector<T> s(rank, 0.0);
+    vector<T> s(rank, 0.0);
     if (is_colmajor) {
         lapack::gesvd(lapack::Job::NoVec, lapack::Job::NoVec,
             m, rank, A_pc.data(), ldapc, s.data(), nullptr, 1, nullptr, 1
@@ -51,7 +63,7 @@ class Test_rpc_svd : public ::testing::Test
         static inline int64_t m = 500;
         static inline int64_t n = 10;
         static inline int64_t d = 30;
-        static inline std::vector<uint64_t> keys = {42, 1};
+        static inline vector<uint64_t> keys = {42, 1};
         static inline double sqrt_cond = 1e5;
         static inline double mu = 1e-6; // only used in "full_rank_after_reg" test.  
     
@@ -72,16 +84,16 @@ class Test_rpc_svd : public ::testing::Test
     template <typename T>
     void test_full_rank_without_reg(
         int key_index,
-        blas::Layout layout
+        Layout layout
     ){  
         // construct "A" with cond(A) >= sqrt_cond^2.
-        std::vector<T> A(m*n, 0.0);
+        vector<T> A(m*n, 0.0);
         T *a = A.data();
-        RandBLAS::DenseDist D(m, n, RandBLAS::DenseDistName::Uniform);
-        auto state = RandBLAS::RNGState(99);
+        DenseDist D(m, n, RandBLAS::ScalarDist::Uniform);
+        auto state = RNGState(99);
         RandBLAS::fill_dense(D, a, state);
     
-        if (layout == blas::Layout::RowMajor) {
+        if (layout == Layout::RowMajor) {
             // scale first row up by sqrt_cond
             // scale second row down by sqrt_cond
             blas::scal(n, sqrt_cond, a, 1);
@@ -96,11 +108,11 @@ class Test_rpc_svd : public ::testing::Test
         }
 
         // apply the function under test (rpc_data_svd_saso)
-        auto alg_state = RandBLAS::RNGState((uint32_t) keys[key_index]);
-        std::vector<T> M_wk(d*n, 0.0);
-        std::vector<T> sigma_sk(n, 0.0);
-        int64_t lda = (layout == blas::Layout::ColMajor) ? m : n;
-        RandBLAS::SparseDist SDist{.n_rows=d, .n_cols=m, .vec_nnz=8};
+        auto alg_state = RNGState((uint32_t) keys[key_index]);
+        vector<T> M_wk(d*n, 0.0);
+        vector<T> sigma_sk(n, 0.0);
+        int64_t lda = (layout == Layout::ColMajor) ? m : n;
+        SparseDist SDist(d, m, 8, RandBLAS::Axis::Short);
         RandBLAS::SparseSkOp<T> S(SDist, alg_state);
         RandBLAS::fill_sparse(S);
         
@@ -132,10 +144,10 @@ class Test_rpc_svd : public ::testing::Test
     ){    
         // construct an ill-conditioned matrix, then zero out first column.
         // After regularization the augmented matrix will still be full-rank.
-        std::vector<double> A(m*n, 0.0);
+        vector<double> A(m*n, 0.0);
         double *a = A.data();
-        RandBLAS::DenseDist D(m, n, RandBLAS::DenseDistName::Uniform);
-        auto state = RandBLAS::RNGState(99);
+        DenseDist D(m, n, RandBLAS::ScalarDist::Uniform);
+        auto state = RNGState(99);
         RandBLAS::fill_dense(D, a, state);
                       
         blas::scal(n, sqrt_cond, a, 1);
@@ -144,20 +156,20 @@ class Test_rpc_svd : public ::testing::Test
         blas::scal(m, 0.0, a, n);
 
         // apply the function under test (rpc_svd_saso)
-        std::vector<double> M_wk(d*n, 0.0);
-        std::vector<double> sigma_sk(n, 0.0);
-        auto alg_state = RandBLAS::RNGState(keys[key_index]);
+        vector<double> M_wk(d*n, 0.0);
+        vector<double> sigma_sk(n, 0.0);
+        auto alg_state = RNGState(keys[key_index]);
         RandLAPACK::rpc_data_svd_saso(
-            blas::Layout::RowMajor, m, n, d, 8,
+            Layout::RowMajor, m, n, d, 8,
             A.data(), n, M_wk.data(), sigma_sk.data(), alg_state
         );
         int64_t rank = RandLAPACK::make_right_orthogonalizer(
-            blas::Layout::RowMajor,
+            Layout::RowMajor,
             n, M_wk.data(), sigma_sk.data(), mu
         );
         EXPECT_EQ(rank, n);
         
-        std::vector<double> A_aug((m + n)*n, 0.0);
+        vector<double> A_aug((m + n)*n, 0.0);
         double *a_aug = A_aug.data();
         blas::copy(m*n, a, 1, a_aug, 1);
         double sqrt_mu = std::sqrt(mu);
@@ -165,20 +177,20 @@ class Test_rpc_svd : public ::testing::Test
         for (int i = 0; i < n; ++i)
             sqrt_mu_eye[n*i + i] = sqrt_mu;
 
-        check_condnum_after_precond(blas::Layout::RowMajor, A_aug, M_wk, rank, m + n, n);
+        check_condnum_after_precond(Layout::RowMajor, A_aug, M_wk, rank, m + n, n);
     }
 };
 
 TEST_F(Test_rpc_svd, FullRankNoReg_rowmajor_double)
 {
-    test_full_rank_without_reg<double>(0, blas::Layout::RowMajor);
-    test_full_rank_without_reg<double>(1, blas::Layout::RowMajor);
+    test_full_rank_without_reg<double>(0, Layout::RowMajor);
+    test_full_rank_without_reg<double>(1, Layout::RowMajor);
 }
 
 TEST_F(Test_rpc_svd, FullRankNoReg_colmajor_double)
 {
-    test_full_rank_without_reg<double>(0, blas::Layout::ColMajor);
-    test_full_rank_without_reg<double>(1, blas::Layout::ColMajor);
+    test_full_rank_without_reg<double>(0, Layout::ColMajor);
+    test_full_rank_without_reg<double>(1, Layout::ColMajor);
 }
 
 TEST_F(Test_rpc_svd, FullRankAfterReg)
@@ -187,51 +199,27 @@ TEST_F(Test_rpc_svd, FullRankAfterReg)
     test_full_rank_after_reg(1);
 }
 
-class TestNystromPrecond : public ::testing::Test
-{
+
+/***
+ * This actually assesses quality of the Nystrom preconditioner.
+ */
+class TestNystromPrecond : public ::testing::Test {
 
     protected:
         static inline int64_t m = 500;
-        static inline std::vector<uint64_t> keys = {42, 1};
+        static inline vector<uint32_t> keys = {42, 1};
     
     virtual void SetUp() {};
 
     virtual void TearDown() {};
 
     template <typename T>
-    void set_invP(int64_t m, int64_t k, T* V, T* lambda, T mu, T* invP) {
-        // compute invP = V * diag((min(lambda) + mu)/(lambda + mu)) * V' + (I - VV').
-        RandLAPACK::util::eye(m, m, invP);
-        blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans,
-            m, m, k, -1.0, V, m, V, m, 1.0, invP, m
-        );
-        for (int i = 0; i < k; ++i) {
-            blas::gemm(blas::Layout::ColMajor, blas::Op::NoTrans, blas::Op::Trans,
-                m, m, 1, (lambda[k-1] + mu) / (lambda[i] + mu),
-                &V[i*m], m, &V[i*m], m, 1.0, invP, m 
-            );
-        }
-    };
-
-    template <typename T>
-    void set_G_mu_pre(int64_t m, T* G, T mu, T* invP, T* G_mu_pre) {
-        // G_mu_pre = (G + mu)*invP
-        blas::copy(m * m, invP, 1, G_mu_pre, 1);
-        blas::scal(m * m, mu, G_mu_pre, 1);
-        blas::symm(blas::Layout::ColMajor, blas::Side::Left, blas::Uplo::Lower,
-            m, m, 1.0, G, m, invP, m, 1.0, G_mu_pre, m
-        );
-        for(int i = 1; i < m; ++i)
-            blas::copy(m - i, &G_mu_pre[i + ((i-1) * m)], 1, &G_mu_pre[(i - 1) + (i * m)], m);
-    };
-
-    template <typename T>
-    void run(int key_index, std::vector<T> &G) {
+    void run(int key_index, vector<T> &G) {
         /* Run the algorithm under test */
-        RandBLAS::RNGState alg_state(keys[key_index]);
+        RNGState alg_state(keys[key_index]);
         alg_state.key.incr();
-        std::vector<T> V(0);
-        std::vector<T> lambda(0);
+        vector<T> V(0);
+        vector<T> lambda(0);
         int64_t k = 1;
         T mu_min = 1e-5;
         RandLAPACK::nystrom_pc_data(
@@ -239,49 +227,53 @@ class TestNystromPrecond : public ::testing::Test
         ); // k has been updated.
 
         /* Verify algorithm output */
-        EXPECT_TRUE(k > 5);
+        EXPECT_TRUE(k > 2);
         EXPECT_TRUE(k < m);
-        std::vector<T> invP(m * m, 0.0);
-        std::vector<T> G_mu_pre(m * m, 0.0);
+        RandLAPACK::linops::SpectralPrecond<T> invP(m);
+        vector<T> G_mu_pre(m * m, 0.0);
+        vector<T> G_mu(m * m);
+        vector<T> mus(1);
+        vector<T> s(m);
 
+        mus[0] = mu_min;
+        G_mu = G;
+        for (int64_t i = 0; i < m; ++i)
+            G_mu[i + i*m] += mus[0];
+        invP.prep(V, lambda, mus, m);
+        invP.evaluate(m, G_mu.data(), G_mu_pre.data());
         T cond_lim = 5;
-        T mu = mu_min;
-        std::vector<T> s(m);
-        set_invP(m, k, V.data(), lambda.data(), mu, invP.data());
-        set_G_mu_pre(m, G.data(), mu, invP.data(), G_mu_pre.data());
         lapack::gesvd(lapack::Job::NoVec, lapack::Job::NoVec, m, m, G_mu_pre.data(), m, s.data(), nullptr, 1, nullptr, 1);
         T cond = s[0] / s[m-1];
         EXPECT_LE(cond, cond_lim);
 
-        mu *= 10;
+        mus[0] *= 10;
+        G_mu = G;
+        for (int64_t i = 0; i < m; ++i)
+            G_mu[i + i*m] += mus[0];
+        invP.prep(V, lambda, mus, m);
+        invP.evaluate(m, G_mu.data(), G_mu_pre.data());
         cond_lim /= 2;
-        set_invP(m, k, V.data(), lambda.data(), mu, invP.data());
-        set_G_mu_pre(m, G.data(), mu, invP.data(), G_mu_pre.data());
         lapack::gesvd(lapack::Job::NoVec, lapack::Job::NoVec, m, m, G_mu_pre.data(), m, s.data(), nullptr, 1, nullptr, 1);
         cond = s[0] / s[m-1];
         EXPECT_LE(cond, cond_lim);
     
-        mu *= 10;
+        mus[0] *= 10;
+        G_mu = G;
+        for (int64_t i = 0; i < m; ++i)
+            G_mu[i + i*m] += mus[0];
+        invP.prep(V, lambda, mus, m);
+        invP.evaluate(m, G_mu.data(), G_mu_pre.data());
         cond_lim /= 2;
-        set_invP(m, k, V.data(), lambda.data(), mu, invP.data());
-        set_G_mu_pre(m, G.data(), mu, invP.data(), G_mu_pre.data());
         lapack::gesvd(lapack::Job::NoVec, lapack::Job::NoVec, m, m, G_mu_pre.data(), m, s.data(), nullptr, 1, nullptr, 1);
         cond = s[0] / s[m-1];
         EXPECT_LE(cond, cond_lim);
-    };
+    }
 };
 
+
 TEST_F(TestNystromPrecond, basictest) {
-    RandLAPACK::gen::mat_gen_info<double> mat_info(m, m, RandLAPACK::gen::polynomial);
-    mat_info.cond_num = 1e6;
-    mat_info.rank = m;
-    mat_info.exponent = 2.0;
-    std::vector<double> A(m * m, 0.0);
-    RandBLAS::RNGState data_state(0);
-    RandLAPACK::gen::mat_gen(mat_info, A.data(), data_state);
-    std::vector<double> G(m * m, 0.0);
-    blas::syrk(Layout::ColMajor, Uplo::Lower, Op::NoTrans, m, m, 1.0,
-        A.data(), m, 0.0, G.data(), m
-    ); // Note: G is PSD with squared spectrum of A.
-    run<double>(0, G);
+    auto G = polynomial_decay_psd<double>(m, 1e12, 2.0, 99);
+    run(0, G);
+    run(1, G);
 }
+
diff --git a/test/comps/test_rpchol.cc b/test/comps/test_rpchol.cc
new file mode 100644
index 00000000..03c281d2
--- /dev/null
+++ b/test/comps/test_rpchol.cc
@@ -0,0 +1,173 @@
+#include "RandLAPACK.hh"
+#include "rl_rpchol.hh"
+#include "rl_blaspp.hh"
+#include "rl_gen.hh"
+#include "../RandLAPACK/RandBLAS/test/comparison.hh"
+
+#include <RandBLAS.hh>
+#include <math.h>
+#include <gtest/gtest.h>
+
+// template <typename T>
+// std::vector<T> eye(int64_t n) {
+//     std::vector<T> A(n * n, 0.0);
+//     for (int i = 0; i < n; ++i)
+//         A[i + n*i] = 1.0;
+//     return A;
+// }
+
+using RandBLAS::RNGState;
+
+template <typename T, typename RNG>
+RNGState<RNG> left_multiply_by_orthmat(int64_t m, int64_t n, std::vector<T> &A, RNGState<RNG> state) {
+    // Overwrite the m-by-n matrix A with Q*A, where Q is the orthogonal factor from a QR of a random m-by-m matrix.
+    std::vector<T> reflectors(m * m, 0.0);
+    std::vector<T> scalars(m, 0.0);
+    RandBLAS::DenseDist square_dist(m, m);
+    auto next_state = RandBLAS::fill_dense(square_dist, reflectors.data(), state);
+    lapack::geqrf(m, m, reflectors.data(), m, scalars.data());
+    lapack::ormqr(blas::Side::Left, blas::Op::NoTrans, m, n, m, reflectors.data(), m, scalars.data(), A.data(), m);
+    return next_state;
+}
+
+template <typename T>
+void full_gram(int64_t n, std::vector<T> &A, blas::Op op, int64_t k = -1) {
+    // Overwrite A with a full n-by-n Gram matrix: syrk on a copy of A, then mirror the upper triangle.
+    // k is the inner dimension; the default -1 means k = n, and an explicit k is only allowed with Op::NoTrans.
+    std::vector<T> snapshot(A);
+    bool k_given = (k != -1);
+    if (k_given) { randblas_require(op == blas::Op::NoTrans); }
+    int64_t inner = k_given ? k : n;
+    auto col_major = blas::Layout::ColMajor;
+    auto upper     = blas::Uplo::Upper;
+    blas::syrk(col_major, upper, op, n, inner, 1.0, snapshot.data(), n, 0.0, A.data(), n);
+    RandBLAS::symmetrize(col_major, upper, n, A.data(), n);
+}
+
+class TestRPCholesky : public ::testing::Test {
+    protected:
+    // Shared drivers for the rp_cholesky tests; each one builds an explicit matrix and checks an exact (rank n) factorization.
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+    // Run rp_cholesky with k = n, then check that F*F' reproduces Abuff (within atol/rtol) and that the pivots are distinct.
+    template <typename T, typename FUNC>
+    void run_exact(int64_t n, FUNC &A, T* Abuff, int64_t b, T atol, T rtol, uint32_t seed) {
+        using std::vector;
+        // F has room for an n-by-k factor; selection starts at -1 so entries left unset by rp_cholesky can be recognized.
+        int64_t k = n;
+        vector<T> F(n*k, 0.0);
+        vector<int64_t> selection(k, -1);
+        RandBLAS::RNGState state_in(seed);
+        auto state_out = RandLAPACK::rp_cholesky(n, A, k, selection.data(), F.data(), b, state_in);
+        // Form F*F' (full_gram with Op::NoTrans) and compare it against the input matrix.
+        vector<T> Arecovered(F);
+        full_gram(n, Arecovered, blas::Op::NoTrans, k);
+        test::comparison::matrices_approx_equal(
+            blas::Layout::ColMajor, blas::Op::NoTrans, n, n, Abuff, n, Arecovered.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__,
+            atol, rtol
+        );
+        // Check that the pivots are reasonable and nontrivial (i.e., not the sequence from 0 to n-1).
+        std::set<int64_t> selection_unique{};
+        for (auto pivot : selection) {
+            if (pivot != -1)
+                selection_unique.insert(pivot);
+        }
+        ASSERT_EQ((int64_t) selection_unique.size(), k) << "using seed " << seed;
+        if (n > 4)
+            ASSERT_FALSE(std::is_sorted(selection.begin(), selection.end())) <<  "using seed " << seed;
+        // ^ is_sorted() checks if we're in increasing order
+        return;
+    }
+    // Exactness test on diag(1^power, 2^power, ..., n^power), with atol = rtol = sqrt(n) * machine epsilon.
+    template <typename T>
+    void run_exact_diag(int64_t n, int64_t b, int64_t power, uint32_t seed) {  
+        std::vector<T> Avec(n * n, 0.0);
+        for (int64_t i = 0; i < n; ++i)
+            Avec[i + n*i] = std::pow((T) i + 1, power);
+        auto Abuff = Avec.data();
+        auto A = [Abuff, n](int64_t i, int64_t j) { return Abuff[i + n*j]; };
+
+        T atol = std::sqrt(n) * std::numeric_limits<T>::epsilon();
+        T rtol = std::sqrt(n) * std::numeric_limits<T>::epsilon();
+        run_exact(n, A, Abuff, b, atol, rtol, seed);
+        return;
+    }
+    // Exactness test on the Gram matrix of a Kahan matrix; the tolerance is calibrated against LAPACK's potrf (see below).
+    template <typename T>
+    void run_exact_kahan_gram(int64_t n, int64_t b, uint32_t seed) {
+        using std::vector;
+        vector<T> Avec(n * n, 0.0);
+        T theta = 1.2;
+        T perturb = 10;
+        RandLAPACK::gen::gen_kahan_mat(n, n, Avec.data(), theta, perturb);
+        vector<T> kahan(Avec);
+        full_gram(n, Avec, blas::Op::Trans);
+        // ^ Avec now represents the Gram matrix of the Kahan matrix.
+
+        std::vector<T> gk_chol(Avec); 
+        // ^ We'll run Cholesky on the Gram matrix of the Kahan matrix,
+        //   and compare to the Kahan matrix itself. This helps us get
+        //   a realistic tolerance considering the numerical nastiness
+        //   of the Kahan matrix.
+        auto status = lapack::potrf(blas::Uplo::Upper, n, gk_chol.data(), n);
+        randblas_require(status == 0);
+        T atol = 0.0;
+        RandLAPACK::util::get_U(n, n, gk_chol.data(), n);
+        for (int64_t i = 0; i < n*n; ++i) {
+            T val1 = std::abs(kahan[i] - gk_chol[i]);
+            T val2 = std::abs(kahan[i] + gk_chol[i]);
+            atol = std::max(atol, std::min(val1, val2));
+        }
+        atol = std::sqrt(n) * atol;
+        // The matrix is handed to run_exact as an entrywise accessor over Abuff (column-major, leading dimension n).
+        T* Abuff = Avec.data();
+        auto A = [Abuff, n](int64_t i, int64_t j) { return Abuff[i + n*j]; };
+        run_exact(n, A, Abuff, b, atol, atol, seed);
+        // ^ use the same value for rtol and atol
+        return;
+    }
+};
+
+
+TEST_F(TestRPCholesky, test_exact_diag_b1) {
+    for (uint32_t i = 2012; i < 2019; ++i) {
+        run_exact_diag<float>(5,   1, 2, i);
+        run_exact_diag<float>(10,  1, 1, i);
+        run_exact_diag<float>(10,  1, 2, i);
+        run_exact_diag<float>(13,  1, 2, i);
+        run_exact_diag<float>(100, 1, 2, i);
+    }
+}
+
+TEST_F(TestRPCholesky, test_exact_diag_b2) {
+    for (uint32_t i = 2012; i < 2019; ++i) {
+        run_exact_diag<float>(10,  2, 1, i);
+        run_exact_diag<float>(10,  2, 2, i);
+        run_exact_diag<float>(100, 2, 2, i);
+    }
+}
+
+TEST_F(TestRPCholesky, test_exact_kahan_gram_b1) {
+    for (uint32_t i = 2012; i < 2019; ++i) {
+        run_exact_kahan_gram<float>(5,  1, i);
+        run_exact_kahan_gram<float>(10, 1, i);
+    }
+}
+
+TEST_F(TestRPCholesky, test_exact_kahan_gram_b2) {
+    for (uint32_t i = 2012; i < 2019; ++i) {
+        run_exact_kahan_gram<float>(10,  2, i);
+        run_exact_kahan_gram<float>(11,  2, i);
+        run_exact_kahan_gram<float>(12,  2, i);
+    }
+}
+
+TEST_F(TestRPCholesky, test_exact_kahan_gram_b3) {
+    for (uint32_t i = 2012; i < 2019; ++i) {
+        run_exact_kahan_gram<float>(9,   3, i);
+        run_exact_kahan_gram<float>(10,  3, i);
+        run_exact_kahan_gram<float>(11,  3, i);
+        run_exact_kahan_gram<float>(12,  3, i);
+    }
+}
diff --git a/test/drivers/test_cqrrp.cc b/test/drivers/test_cqrrp.cc
index e724f600..f10b24e2 100644
--- a/test/drivers/test_cqrrp.cc
+++ b/test/drivers/test_cqrrp.cc
@@ -1,3 +1,4 @@
+#if !defined(__APPLE__)
 #include "RandLAPACK.hh"
 #include "rl_blaspp.hh"
 #include "rl_lapackpp.hh"
@@ -146,7 +147,6 @@ class TestCQRRP : public ::testing::Test
 
 };
 
-#if !defined(__APPLE__)
 // Note: If Subprocess killed exception -> reload vscode
 TEST_F(TestCQRRP, CQRRP_blocked_full_rank_basic) {
     int64_t m = 5000;//5000;
diff --git a/test/drivers/test_hqrrp.cc b/test/drivers/test_hqrrp.cc
index ffa09f2b..5ade37e0 100644
--- a/test/drivers/test_hqrrp.cc
+++ b/test/drivers/test_hqrrp.cc
@@ -7,7 +7,6 @@
 #include <fstream>
 #include <gtest/gtest.h>
 
-
 class TestHQRRP : public ::testing::Test
 {
     protected:
diff --git a/test/drivers/test_krillx.cc b/test/drivers/test_krillx.cc
new file mode 100644
index 00000000..77247c75
--- /dev/null
+++ b/test/drivers/test_krillx.cc
@@ -0,0 +1,189 @@
+#include <blas.hh>
+#include <RandBLAS.hh>
+#include <RandLAPACK.hh>
+#include <gtest/gtest.h>
+#include <math.h>
+#include <lapack.hh>
+
+#include "../moremats.hh"
+#include "../RandLAPACK/RandBLAS/test/comparison.hh"
+
+
+using std::vector;
+using blas::Layout;
+using blas::Op;
+using RandBLAS::DenseDist;
+using RandBLAS::SparseDist;
+using RandBLAS::RNGState;
+using RandLAPACK::linops::RegExplicitSymLinOp;
+using RandLAPACK::linops::RBFKernelMatrix;
+using RandLAPACK_Testing::polynomial_decay_psd;
+
+
+class TestKrillIsh: public ::testing::Test {
+    // Fixture: PCG solves of regularized PSD systems using manually constructed spectral preconditioners.
+    protected:
+        static inline int64_t m = 1000;                 // order of every test matrix
+        static inline vector<uint32_t> keys = {42, 1};  // RNG keys, selected through key_index
+    
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+    // Shared check: build a SpectralPrecond from (V, lambda), form H by applying G_linop to a Gaussian X_star, run lockorblock_pcg, and compare X_init with X_star.
+    template <typename T>
+    void run_common(T mu_min, vector<T> &V, vector<T> &lambda, RegExplicitSymLinOp<T> &G_linop) {
+        RandLAPACK::linops::SpectralPrecond<T> invP(m);
+        vector<T> mus {mu_min, mu_min/10, mu_min/100};
+        G_linop.regs = mus;  // presumably one regularizer per right-hand-side column -- confirm against RegExplicitSymLinOp
+        G_linop.set_eval_includes_reg(true);
+        invP.prep(V, lambda, mus, mus.size());
+        int64_t s = mus.size();  // number of right-hand-side columns
+        // X_star is the reference solution, X_init the all-zero buffer handed to the solver, H the right-hand side.
+        vector<T> X_star(m*s, 0.0);
+        vector<T> X_init(m*s, 0.0);
+        vector<T> H(m*s, 0.0);
+        RNGState state0(101);
+        DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian};
+        auto Xsd = X_star.data();
+        RandBLAS::fill_dense(DX_star, Xsd, state0);  // returned RNG state is not needed afterwards
+        G_linop(blas::Layout::ColMajor, s, 1.0, X_star.data(), m, 0.0, H.data(), m);  // H = 1.0 * G_linop(X_star) + 0.0 * H
+
+        std::cout << "\nFrobenius norm of optimal solution : " << blas::nrm2(m*s, X_star.data(), 1);
+        std::cout << "\nFrobenius norm of right-hand-side  : " << blas::nrm2(m*s, H.data(), 1) << std::endl;
+        RandLAPACK::StatefulFrobeniusNorm<T> seminorm{};
+        T tol = 100*std::numeric_limits<T>::epsilon();
+        int64_t max_iters = 30;
+        RandLAPACK::lockorblock_pcg(G_linop, H, tol, max_iters, invP, seminorm, X_init, true);
+        // The comparison is much looser than the solver tolerance: atol = sqrt(m)*sqrt(eps), rtol = sqrt(m)*atol.
+        T tol_scale = std::sqrt((T)m);
+        T atol = tol_scale * std::pow(std::numeric_limits<T>::epsilon(), 0.5);
+        T rtol = tol_scale * atol;
+        test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s,
+            __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol
+        );
+        return;
+    }
+    // Preconditioner data from nystrom_pc_data (starting rank 64, passed mu_min/10), then the shared check.
+    template <typename T = double>
+    void run_nystrom(int key_index, vector<T> &G) {
+        /* Run the algorithm under test */
+        RNGState alg_state(keys[key_index]);
+        alg_state.key.incr();
+        vector<T> V(0);
+        vector<T> lambda(0);
+        int64_t k = 64;
+        T mu_min = 1e-5;
+        vector<T> regs{};
+        RegExplicitSymLinOp G_linop(m, G.data(), m, regs);
+        RandLAPACK::nystrom_pc_data(
+            G_linop, V, lambda, k, mu_min/10, alg_state
+        ); // k has been updated.
+        EXPECT_GT(k, 5);  // comparison macros print both operands when they fail
+        EXPECT_LT(k, m);
+        run_common(mu_min, V, lambda, G_linop);
+    }
+    // Preconditioner data from rpchol_pc_data (rank 128, block size 4), then the shared check.
+    template <typename T = double>
+    void run_rpchol(int key_index, vector<T> &G) {
+        RNGState alg_state(keys[key_index]);
+        alg_state.key.incr();
+        int64_t k = 128;
+        vector<T> V(m*k);
+        vector<T> lambda(k);
+        T mu_min = 1e-5;
+        int64_t rp_chol_block_size = 4;
+        vector<T> regs{};
+        RegExplicitSymLinOp G_linop(m, G.data(), m, regs);
+        RandLAPACK::rpchol_pc_data(m, G_linop, k, rp_chol_block_size, V.data(), lambda.data(), alg_state);
+        EXPECT_EQ(k, 128);  // the test expects rpchol_pc_data to leave the requested rank unchanged
+        run_common(mu_min, V, lambda, G_linop);
+    }
+};
+
+TEST_F(TestKrillIsh, test_manual_lockstep_nystrom) {
+    for (int64_t decay = 2; decay < 4; ++decay) {  // decay exponents 2 and 3
+        auto G = polynomial_decay_psd(m, 1e12, (double) decay, 99);  // m-by-m PSD matrix, cond_num argument 1e12, seed 99
+        run_nystrom(0, G);  // keys[0]
+        run_nystrom(1, G);  // keys[1]
+    }
+}
+
+TEST_F(TestKrillIsh, test_manual_lockstep_rpchol) {
+    auto G = polynomial_decay_psd(m, 1e12, 2.0, 99);  // m-by-m PSD matrix, cond_num argument 1e12, exponent argument 2.0, seed 99
+    run_rpchol(0, G);  // keys[0]
+    run_rpchol(1, G);  // keys[1]
+}
+
+
+class TestKrillx: public ::testing::Test {
+    // Fixture: end-to-end checks of krill_full_rpchol on regularized linear operators.
+    protected:
+        static inline int64_t m = 1000;                 // order of every test operator
+        static inline vector<uint32_t> keys = {42, 1};  // RNG keys, selected through key_index
+    
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+    // Forms H by applying G_linop (regularization included) to a Gaussian X_star, runs krill_full_rpchol with rank parameter k, and compares X_init with X_star.
+    template <typename RELO>
+    void run_krill_separable(int key_index, RELO &G_linop, int64_t k) {
+        using T = typename RELO::scalar_t;
+        int64_t s = G_linop.regs.size();  // number of right-hand-side columns = number of regularizers
+        // X_star is the reference solution, X_init the all-zero buffer handed to the solver, H the right-hand side.
+        vector<T> X_star(m*s, 0.0);
+        vector<T> X_init(m*s, 0.0);
+        vector<T> H(m*s, 0.0);
+        RNGState state0(101);
+        DenseDist DX_star {m, s, RandBLAS::ScalarDist::Gaussian};
+        auto Xsd = X_star.data();
+        RandBLAS::fill_dense(DX_star, Xsd, state0);  // returned RNG state is not needed afterwards
+        G_linop.set_eval_includes_reg(true);
+        G_linop(blas::Layout::ColMajor, s, 1.0, X_star.data(), m, 0.0, H.data(), m);  // H = 1.0 * G_linop(X_star) + 0.0 * H
+        std::cout << "\nFrobenius norm of optimal solution : " << blas::nrm2(m*s, X_star.data(), 1);
+        std::cout << "\nFrobenius norm of right-hand-side  : " << blas::nrm2(m*s, H.data(), 1) << std::endl;
+
+        RandLAPACK::StatefulFrobeniusNorm<T> seminorm{};
+        T tol = 100*std::numeric_limits<T>::epsilon();
+        int64_t max_iters = 30;
+        int64_t rpc_blocksize = 16;
+        RNGState state2(keys[key_index]);
+        RandLAPACK::krill_full_rpchol(
+            m, G_linop, H, X_init, tol, state2, seminorm, rpc_blocksize, max_iters, k
+        );
+        T tol_scale = std::sqrt((T)m);  // comparison tolerances: atol = sqrt(m)*sqrt(eps), rtol = sqrt(m)*atol
+        T atol = tol_scale * std::pow(std::numeric_limits<T>::epsilon(), 0.5);
+        T rtol = tol_scale * atol;
+        test::comparison::buffs_approx_equal(X_init.data(), X_star.data(), m * s,
+            __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, rtol
+        );
+        return;
+    }
+};
+
+TEST_F(TestKrillx, test_krill_full_rpchol) {
+    using T = double;
+    T mu_min = 1e-5;
+    vector<T> mus {mu_min, mu_min/10, mu_min/100};  // three regularizers, decreasing from mu_min
+    for (int64_t decay = 2; decay < 4; ++decay) {  // decay exponents 2 and 3
+        auto G = polynomial_decay_psd(m, 1e12, (T) decay, 99);  // m-by-m PSD matrix, cond_num argument 1e12, seed 99
+        RegExplicitSymLinOp G_linop(m, G.data(), m, mus);  // G_linop reads G's buffer, so G must outlive it
+        int64_t k = 128;
+        run_krill_separable(0, G_linop, k);  // keys[0]
+        run_krill_separable(1, G_linop, k);  // keys[1]
+    }
+}
+
+TEST_F(TestKrillx, test_krill_separable_squared_exp_kernel) {
+    using T = double;
+    T mu_min = 1e-2;
+    vector<T> mus {mu_min, mu_min*10, mu_min*100};  // three regularizers, increasing from mu_min
+    for (uint32_t key = 0; key < 5; ++key) {  // key seeds the data matrix X0
+        // X0 is a 5-by-m matrix (one 5-dimensional point per column) generated from seed `key`.
+        // G_linop is the RBF kernel operator on those m points with bandwidth 3.0 and regularizers mus.
+        vector<T> X0 = RandLAPACK_Testing::random_gaussian_mat<T>(5, m, key);
+        RBFKernelMatrix G_linop(m, X0.data(), 5, 3.0, mus);
+        int64_t k = 128;
+        run_krill_separable(0, G_linop, k);  // keys[0]
+        run_krill_separable(1, G_linop, k);  // keys[1]
+    }
+}
diff --git a/test/misc/test_linops.cc b/test/misc/test_linops.cc
new file mode 100644
index 00000000..af86ab85
--- /dev/null
+++ b/test/misc/test_linops.cc
@@ -0,0 +1,112 @@
+#include <blas.hh>
+#include <RandBLAS.hh>
+#include <RandLAPACK.hh>
+#include <gtest/gtest.h>
+#include <math.h>
+#include <lapack.hh>
+#include "../RandLAPACK/RandBLAS/test/comparison.hh"
+
+
+using std::vector;
+using blas::Layout;
+using blas::Op;
+using RandBLAS::DenseDist;
+using RandBLAS::SparseDist;
+using RandBLAS::RNGState;
+
+
+/**
+ * Note: a few implicit linear operators are tested implicitly (ha) in
+ * test_determiter.cc. It's important to have tests for these things
+ * since bugs in their implementation can be hard to track down. 
+ */
+
+
+class TestSpectralPrecondLinearOperator: public ::testing::Test {  // fixture for SpectralPrecond on small diagonal problems
+
+    protected:
+    
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+
+    // Run on a diagonal matrix with an optimal rank-k preconditioner.
+    template <typename T>
+    void run_diag(int64_t n, int64_t k, T mu) {  // n: matrix order, k: number of preconditioner eigenpairs (indexing below needs 1 <= k <= n), mu: regularizer
+        int64_t i;
+        vector<T> alleigs(n);
+        vector<T> allV(n*n, 0.0);  // identity matrix; filled below but not read again in this function
+        for (i = 0; i < n; ++i) {
+            alleigs[i] = std::pow((T)i + (T)1.0, (T) -3.0);  // eigenvalue (i+1)^{-3}, so alleigs is decreasing
+            allV[i + i*n] = 1.0;
+        }
+
+        vector<T> G_mu(n*n, 0.0);  // G_mu = diag(alleigs) + mu*I, column-major
+        for (i = 0; i < n; ++i) {
+            G_mu[i + i*n] = alleigs[i] + mu;
+        }
+
+        vector<T> pceigs(k);  // the k largest eigenvalues
+        vector<T> pcV(n*k, 0.0);  // first k columns of the n-by-n identity
+        for (i = 0; i < k; ++i) {
+            pceigs[i] = alleigs[i];
+            pcV[i + i*n] = 1.0;
+        }
+        vector<T> G_mu_pre_expect(n*n, 0.0);  // expected result: diagonal, flattened to one value on the first k entries
+        T scale_on_precond_subspace = alleigs[k-1] + mu;  // k-th eigenvalue plus mu
+        for (i = 0; i < n; ++i) {
+            if (i < k) {
+                G_mu_pre_expect[i + i*n] = scale_on_precond_subspace;
+            } else {
+                G_mu_pre_expect[i + i*n] = alleigs[i] + mu;  // entries outside the first k are unchanged
+            }
+        }
+        RandLAPACK::linops::SpectralPrecond<T> invP_operator(n);
+        vector<T> mus(1, mu);  // a single regularizer
+        invP_operator.prep(pcV, pceigs, mus, n);  // last argument n: presumably the number of columns evaluate() will receive -- confirm
+        vector<T> G_mu_pre_actual(n*n, 0.0);
+        invP_operator.evaluate(n, G_mu.data(), G_mu_pre_actual.data());  // input G_mu, output G_mu_pre_actual
+        test::comparison::matrices_approx_equal(
+            Layout::ColMajor, Op::NoTrans, n, n, G_mu_pre_actual.data(), n,
+            G_mu_pre_expect.data(), n, __PRETTY_FUNCTION__, 
+            __FILE__, __LINE__
+        );
+        return;
+    }
+};
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n3_k1) {
+    run_diag<float>(3, 1, 0.1);  // n = 3, k = 1, mu = 0.1
+}
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n3_k2) {
+    run_diag<float>(3, 2, 0.1);  // n = 3, k = 2, mu = 0.1
+}
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n4_k1) {
+    run_diag<float>(4, 1, 0.1);  // n = 4, k = 1, mu = 0.1
+}
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n4_k2) {
+    run_diag<float>(4, 2, 0.1);  // n = 4, k = 2, mu = 0.1
+}
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n4_k3) {
+    run_diag<float>(4, 3, 0.1);  // n = 4, k = 3, mu = 0.1
+}
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n5_k1) {
+    run_diag<float>(5, 1, 0.1);  // n = 5, k = 1, mu = 0.1
+}
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n5_k2) {
+    run_diag<float>(5, 2, 0.1);  // n = 5, k = 2, mu = 0.1
+}
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n5_k3) {
+    run_diag<float>(5, 3, 0.1);  // n = 5, k = 3, mu = 0.1
+}
+
+TEST_F(TestSpectralPrecondLinearOperator, test_diag_n5_k4) {
+    run_diag<float>(5, 4, 0.1);  // n = 5, k = 4, mu = 0.1
+}
diff --git a/test/misc/test_pdkernels.cc b/test/misc/test_pdkernels.cc
new file mode 100644
index 00000000..24b02f46
--- /dev/null
+++ b/test/misc/test_pdkernels.cc
@@ -0,0 +1,268 @@
+#include "RandLAPACK.hh"
+#include "rl_blaspp.hh"
+#include "rl_gen.hh"
+
+#include <RandBLAS.hh>
+#include "../RandLAPACK/RandBLAS/test/comparison.hh"
+#include "../moremats.hh"
+
+#include <math.h>
+#include <gtest/gtest.h>
+
+using RandBLAS::RNGState;
+using RandBLAS::DenseDist;
+using blas::Layout;
+using std::vector;
+
+class TestPDK_SquaredExponential : public ::testing::Test {  // fixture for squared_exp_kernel and squared_exp_kernel_submatrix
+    protected:
+    
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+
+    /**
+     * Test that squared_exp_kernel_submatrix gives the same result
+     * as calls to squared_exp_kernel.
+     */
+    template <typename T>
+    void run_same_blockimpl_vs_entrywise(int64_t d, int64_t n, T bandwidth, uint32_t seed) {
+        vector<T> K_blockimpl(n*n, 0.0);
+        vector<T> K_entrywise(n*n, 0.0);
+        vector<T> X = RandLAPACK_Testing::random_gaussian_mat<T>(d, n, seed);  // d-by-n; column i (length d) is the i-th data point
+        vector<T> squared_norms(n, 0.0);
+        T* X_ = X.data();
+        for (int64_t i = 0; i < n; ++i) {
+            squared_norms[i] = std::pow(blas::nrm2(d, X_ + i*d, 1), 2);  // squared Euclidean norm of column i
+        }
+        RandLAPACK::squared_exp_kernel_submatrix(
+            d, n, X_, squared_norms.data(), n, n, K_blockimpl.data(), 0, 0, bandwidth
+        );  // n-by-n output with both offset arguments 0, i.e. presumably the full kernel matrix -- confirm argument order
+        for (int64_t j = 0; j < n; ++j) {
+            for (int64_t i = 0; i < n; ++i) {
+                T* xi = X.data() + i*d;
+                T* xj = X.data() + j*d;
+                K_entrywise[i + j*n] = RandLAPACK::squared_exp_kernel(d, xi, xj, bandwidth);  // column-major entry (i, j)
+            }
+        }
+        T atol = 3 * d * std::numeric_limits<T>::epsilon() * (1.0 + std::pow(bandwidth, -2));  // tolerance grows with d and with bandwidth^{-2}
+        test::comparison::matrices_approx_equal(
+            blas::Layout::ColMajor, blas::Op::NoTrans, n, n, K_blockimpl.data(), n,
+            K_entrywise.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, atol
+        );
+        return;
+    }
+
+    /**
+     * Test that if all of X's columns are the same then the squared exponential kernel
+     * gives a matrix of all ones.
+     */
+    template <typename T>
+    void run_all_same_column(int64_t d, int64_t n, uint32_t seed) {
+        vector<T> c = RandLAPACK_Testing::random_gaussian_mat<T>(d, 1, seed);  // one random point of dimension d
+        vector<T> X(d*n, 0.0);
+        T* _X = X.data();
+        T* _c = c.data();
+        for (int64_t i = 0; i < n; ++i) {
+            blas::copy(d, _c, 1, _X + i*d, 1);  // column i of X = c
+        }
+        T sqnorm = std::pow(blas::nrm2(d, _c, 1), 2);
+        vector<T> squarednorms(n, sqnorm);  // every column has the same squared norm
+        vector<T> K(n*n, 0.0);
+        T bandwidth = 2.3456;
+        RandLAPACK::squared_exp_kernel_submatrix(
+            d, n, _X, squarednorms.data(), n, n, K.data(), 0, 0, bandwidth
+        );
+        vector<T> expected(n*n, 1.0);
+        test::comparison::matrices_approx_equal(
+            blas::Layout::ColMajor, blas::Op::NoTrans, n, n, K.data(), n,
+            expected.data(), n, __PRETTY_FUNCTION__, __FILE__, __LINE__
+        );  // no explicit tolerances here; the comparison helper's defaults apply
+        return;
+    }
+
+    /**
+     * Test that if the columns of X are orthonormal then the diagonal
+     * will be all ones and the off-diagonal will be exp(-bandwidth^{-2});
+     * the expected off-diagonal value therefore changes with the bandwidth.
+     */
+    template <typename T>
+    void run_orthogonal(int64_t n, T bandwidth, uint32_t seed) {
+        std::vector<T> X(n*n, 0.0);
+        for (int64_t i = 0; i < n; ++i)
+            X[i+i*n] = 1.0;  // start from the identity
+        RNGState state(seed);
+        RandLAPACK_Testing::left_multiply_by_orthmat(n, n, X, state);  // X is overwritten in place
+        vector<T> squarednorms(n, 1.0);  // orthonormal columns have unit norm
+        vector<T> K(n*n, 0.0);
+        RandLAPACK::squared_exp_kernel_submatrix(
+            n, n, X.data(), squarednorms.data(), n, n, K.data(), 0, 0, bandwidth
+        );
+        T offdiag = std::exp(-std::pow(bandwidth, -2));
+        std::vector<T> expect(n*n);
+        for (int64_t j = 0; j < n; ++j) {
+            for (int64_t i = 0; i < n; ++i) {
+                if (i == j) {
+                    expect[i+j*n] = 1.0;
+                } else {
+                    expect[i+j*n] = offdiag;
+                }
+            }
+        }
+        T atol = 50 * std::numeric_limits<T>::epsilon();
+        test::comparison::matrices_approx_equal(
+            blas::Layout::ColMajor, blas::Op::NoTrans, n, n, K.data(), n,
+            expect.data(), n,  __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, atol
+        );
+        return;
+    }
+
+};
+
+TEST_F(TestPDK_SquaredExponential, test_repeated_columns) {
+    for (uint32_t i = 10; i < 15; ++i) {  // seeds 10..14
+        run_all_same_column<float>(3, 9, i);  // d = 3 < n = 9
+        run_all_same_column<float>(9, 3, i);  // d = 9 > n = 3
+    }
+}
+
+
+TEST_F(TestPDK_SquaredExponential, test_blockimpl_vs_entrywise_full_matrix_d_3_n_10) {
+    for (uint32_t i = 2; i < 7; ++i) {  // seeds 2..6
+        run_same_blockimpl_vs_entrywise<float>(3, 10, 1.0, i);  // d = 3, n = 10, bandwidth 1.0
+        run_same_blockimpl_vs_entrywise<float>(3, 10, 0.2, i);  // bandwidth 0.2
+        run_same_blockimpl_vs_entrywise<float>(3, 10, 5.9, i);  // bandwidth 5.9
+    }
+}
+
+TEST_F(TestPDK_SquaredExponential, test_blockimpl_vs_entrywise_full_matrix_d_10_n_3) {
+    for (uint32_t i = 2; i < 7; ++i) {  // seeds 2..6
+        run_same_blockimpl_vs_entrywise<float>(10, 3, 1.0, i);  // d = 10, n = 3, bandwidth 1.0
+        run_same_blockimpl_vs_entrywise<float>(10, 3, 0.2, i);  // bandwidth 0.2
+        run_same_blockimpl_vs_entrywise<float>(10, 3, 5.9, i);  // bandwidth 5.9
+    }
+}
+
+TEST_F(TestPDK_SquaredExponential, test_orthogonal_columns) {
+    for (uint32_t i = 70; i < 75; ++i) {  // seeds 70..74
+        run_orthogonal(5, 0.5, i);  // no explicit template argument: T is deduced as double from the bandwidth literal
+        run_orthogonal(5, 1.1, i);  // n = 5, bandwidth 1.1
+        run_orthogonal(5, 3.0, i);  // n = 5, bandwidth 3.0
+    }
+}
+
+
+class TestPDK_RBFKernelMatrix : public ::testing::Test {  // fixture: RBFKernelMatrix applied to the identity vs. squared_exp_kernel_submatrix
+    protected:
+
+    virtual void SetUp() {};
+
+    virtual void TearDown() {};
+    // Checks alpha*K*I + beta*C for beta = 0 and beta = 0.3 (C is all ones); reg is added on the diagonal only when use_reg is true.
+    template <typename T>
+    void run(T bandwidth, T reg, int64_t m, int64_t d, uint32_t seed, bool use_reg = true) {
+        RNGState state_x(seed);
+        DenseDist D(d, m);
+        vector<T> X_vec(d*m);  // d-by-m data matrix; column i (length d) is the i-th point
+        T* X = X_vec.data();
+        RandBLAS::fill_dense(D, X, state_x);
+        vector<T> regs(1,reg);  // a single regularizer
+        RandLAPACK::linops::RBFKernelMatrix K(m, X, d, bandwidth, regs);
+        K.set_eval_includes_reg(use_reg);
+
+        vector<T> eye(m * m, 0.0);
+        vector<T> sq_colnorms(m, 0.0);
+        for (int64_t i = 0; i < m; ++i) {
+            eye[i + m*i] = 1.0;
+            sq_colnorms[i] = std::pow(blas::nrm2(d, X + i*d, 1), 2);  // squared Euclidean norm of column i
+        }
+        vector<T> K_out_expect(m * m, 0.0);
+
+        // Expected output when (alpha, beta) = (0.25, 0.0)
+        T alpha = 0.25;
+        RandLAPACK::squared_exp_kernel_submatrix(
+            d, m, X, sq_colnorms.data(), m, m, K_out_expect.data(), 0, 0, bandwidth
+        );
+        blas::scal(m * m, alpha, K_out_expect.data(), 1);
+        if (use_reg) {
+            for (int64_t i = 0; i < m; ++i)  // int64_t index to match the type of m
+                K_out_expect[i + i*m] += alpha * reg;
+        }
+        vector<T> K_out_actual1(m * m, 1.0);  // non-zero fill checks that beta = 0 ignores the previous contents
+        K(blas::Layout::ColMajor, m, alpha, eye.data(), m, 0.0, K_out_actual1.data(), m);
+
+        T atol = d * std::numeric_limits<T>::epsilon() * (1.0 + std::pow(bandwidth, -2));
+        test::comparison::matrices_approx_equal(
+            blas::Layout::ColMajor, blas::Op::NoTrans, m, m, K_out_actual1.data(), m,
+            K_out_expect.data(), m, __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, atol
+        );
+
+        // Expected output when (alpha, beta) = (0.25, 0.3)
+        T beta = 0.3;
+        for (int64_t i = 0; i < m*m; ++i)  // int64_t index: m*m is an int64_t product and can exceed INT_MAX
+            K_out_expect[i] += beta;  // the output buffer starts as all ones, so beta*C adds beta to every entry
+        vector<T> K_out_actual2(m * m, 1.0);
+        K(blas::Layout::ColMajor, m, alpha, eye.data(), m, beta, K_out_actual2.data(), m);
+
+        test::comparison::matrices_approx_equal(
+            blas::Layout::ColMajor, blas::Op::NoTrans, m, m, K_out_actual2.data(), m,
+            K_out_expect.data(), m, __PRETTY_FUNCTION__, __FILE__, __LINE__, atol, atol
+        );
+        return;
+    }
+
+};
+
+TEST_F(TestPDK_RBFKernelMatrix, apply_to_eye_m100_d3) {
+    double mu = 0.123;  // passed as reg, but use_reg = false below, so run() leaves it out of the operator and the expected matrix
+    for (uint32_t i = 77; i < 80; ++i) {  // seeds 77..79
+        run(1.0,      mu, 100, 3, i, false);  // bandwidth 1.0, m = 100, d = 3
+        run(2.0,      mu, 100, 3, i, false);
+        run(2.345678, mu, 100, 3, i, false);
+    }
+}
+
+TEST_F(TestPDK_RBFKernelMatrix, apply_to_eye_m256_d4) {
+    double mu = 0.123;  // passed as reg, but use_reg = false below, so run() leaves it out of the operator and the expected matrix
+    for (uint32_t i = 77; i < 80; ++i) {  // seeds 77..79
+        run(1.0,      mu, 256, 4, i, false);  // bandwidth 1.0, m = 256, d = 4
+        run(2.0,      mu, 256, 4, i, false);
+        run(2.345678, mu, 256, 4, i, false);
+    }
+}
+
+TEST_F(TestPDK_RBFKernelMatrix, apply_to_eye_m999_d7) {
+    double mu = 0.123;  // passed as reg, but use_reg = false below, so run() leaves it out of the operator and the expected matrix
+    for (uint32_t i = 77; i < 80; ++i) {  // seeds 77..79
+        run(1.0,      mu, 999, 7, i, false);  // bandwidth 1.0, m = 999, d = 7
+        run(2.0,      mu, 999, 7, i, false);
+        run(2.345678, mu, 999, 7, i, false);
+    }
+}
+
+TEST_F(TestPDK_RBFKernelMatrix, reg_apply_to_eye_m100_d3) {
+    double bandwidth = 1.1;
+    for (uint32_t i = 77; i < 80; ++i) {  // seeds 77..79
+        run(bandwidth, 0.1,      100, 3, i);  // reg = 0.1, m = 100, d = 3; use_reg defaults to true
+        run(bandwidth, 1.0,      100, 3, i);
+        run(bandwidth, 7.654321, 100, 3, i);
+    }
+}
+
+TEST_F(TestPDK_RBFKernelMatrix, reg_apply_to_eye_m256_d4) {
+    double bandwidth = 1.1;
+    for (uint32_t i = 77; i < 80; ++i) {  // seeds 77..79
+        run(bandwidth, 0.1,      256, 4, i);  // reg = 0.1, m = 256, d = 4; use_reg defaults to true
+        run(bandwidth, 1.0,      256, 4, i);
+        run(bandwidth, 7.654321, 256, 4, i);
+    }
+}
+
+TEST_F(TestPDK_RBFKernelMatrix, reg_apply_to_eye_m257_d5) {
+    double bandwidth = 1.1;
+    for (uint32_t i = 77; i < 80; ++i) {  // seeds 77..79
+        run(bandwidth, 0.1,      257, 5, i);  // reg = 0.1, m = 257, d = 5; use_reg defaults to true
+        run(bandwidth, 1.0,      257, 5, i);
+        run(bandwidth, 7.654321, 257, 5, i);
+    }
+}
\ No newline at end of file
diff --git a/test/comps/test_util.cc b/test/misc/test_util.cc
similarity index 99%
rename from test/comps/test_util.cc
rename to test/misc/test_util.cc
index a2a80b80..9474ce03 100644
--- a/test/comps/test_util.cc
+++ b/test/misc/test_util.cc
@@ -349,10 +349,10 @@ class Test_Inplace_Square_Transpose : public ::testing::Test
 
     virtual void apply(blas::Layout layout) {
         int64_t n = 37;
-        RandBLAS::DenseDist D{n, n};
+        RandBLAS::DenseDist D(n, n);
         RandBLAS::RNGState state(1);
         double *A1 = new double[n*n];
-        state = RandBLAS::fill_dense(D, A1, state).second;
+        state = RandBLAS::fill_dense(D, A1, state);
         double *A2 = new double[n*n];
         blas::copy(n*n, A1, 1, A2, 1);
         RandLAPACK::util::transpose_square(A2, n);
diff --git a/test/moremats.hh b/test/moremats.hh
new file mode 100644
index 00000000..d91eb992
--- /dev/null
+++ b/test/moremats.hh
@@ -0,0 +1,60 @@
+#pragma once
+
+#include <blas.hh>
+#include <RandBLAS.hh>
+#include <RandLAPACK.hh>
+#include <gtest/gtest.h>
+#include <math.h>
+#include <lapack.hh>
+#include "../RandLAPACK/RandBLAS/test/comparison.hh"
+
+
+namespace RandLAPACK_Testing {
+
+using std::vector;
+using blas::Layout;
+using blas::Op;
+using blas::Uplo;
+using RandBLAS::RNGState;
+
+template <typename T>
+vector<T> polynomial_decay_psd(int64_t m, T cond_num, T exponent, uint32_t seed) {  // returns the m-by-m column-major Gram matrix G = A*A^T of a generated matrix A
+    RandLAPACK::gen::mat_gen_info<T> mat_info(m, m, RandLAPACK::gen::polynomial);
+    mat_info.cond_num = std::sqrt(cond_num);  // G has the squared spectrum of A, so A is generated with sqrt(cond_num)
+    mat_info.rank = m;
+    mat_info.exponent = std::sqrt(exponent);  // NOTE(review): squaring a power-law spectrum doubles its exponent; confirm sqrt (rather than exponent/2) is intended
+    mat_info.frac_spectrum_one = 0.05;
+    vector<T> A(m * m, 0.0);
+    RNGState data_state(seed);
+    RandLAPACK::gen::mat_gen(mat_info, A.data(), data_state);
+    vector<T> G(m * m, 0.0);
+    blas::syrk(Layout::ColMajor, Uplo::Upper, Op::NoTrans, m, m, 1.0,
+        A.data(), m, 0.0, G.data(), m
+    ); // Note: G is PSD with squared spectrum of A.
+    RandBLAS::symmetrize(Layout::ColMajor, Uplo::Upper, m, G.data(), m);  // syrk wrote only the upper triangle; presumably this mirrors it into the lower one
+    return G;
+}
+
+template <typename T>
+vector<T> random_gaussian_mat(int64_t m, int64_t n, uint32_t seed) {  // returns a length m*n buffer filled from DenseDist(m, n) with an RNG seeded by `seed`
+    RandBLAS::DenseDist D(m, n);  // no distribution argument given; presumably DenseDist defaults to Gaussian, per this function's name -- confirm
+    RNGState state(seed);
+    vector<T> mat(m*n);
+    RandBLAS::fill_dense(D, mat.data(), state);  // the returned RNG state is intentionally discarded
+    return mat;
+}
+
+template <typename T, typename RNG>
+RNGState<RNG> left_multiply_by_orthmat(int64_t m, int64_t n, std::vector<T> &A, RNGState<RNG> state) {  // overwrites the m-by-n matrix A (leading dimension m) and returns the RNG state after sampling
+    using std::vector;
+    vector<T> U(m * m, 0.0);
+    RandBLAS::DenseDist DU(m, m);
+    auto out_state = RandBLAS::fill_dense(DU, U.data(), state);  // random m-by-m matrix
+    vector<T> tau(m, 0.0);
+    lapack::geqrf(m, m, U.data(), m, tau.data());  // QR factorization; U and tau now hold the factor Q in implicit form
+    lapack::ormqr(blas::Side::Left, blas::Op::NoTrans, m, n, m, U.data(), m, tau.data(), A.data(), m);  // A <- Q * A
+    return out_state;
+}
+
+
+}
\ No newline at end of file