Skip to content

Commit

Permalink
add memory logging
Browse files Browse the repository at this point in the history
  • Loading branch information
rileyjmurray committed Sep 14, 2024
1 parent 9630d3b commit 834681a
Show file tree
Hide file tree
Showing 4 changed files with 214 additions and 9 deletions.
14 changes: 12 additions & 2 deletions RandLAPACK/comps/rl_rpchol.hh
Original file line number Diff line number Diff line change
Expand Up @@ -107,8 +107,8 @@ int downdate_d_and_cdf(Layout layout, int64_t N, vector<int64_t> &indices, T* F_
* https://github.com/eepperly/Robust-randomized-preconditioning-for-kernel-ridge-regression/blob/main/code/choleskybase.m
*
*/
template <typename T, typename FUNC_T, typename STATE>
STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state) {
template <typename T, typename FUNC_T, typename STATE, typename CALLBACK>
STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state, CALLBACK &cb) {
// TODO: make this function robust to rank-deficient matrices.
using RandBLAS::sample_indices_iid;
using RandBLAS::weights_to_cdf;
Expand All @@ -134,6 +134,7 @@ STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F,
std::cout << "weights_to_cdf failed with exit code " << w_status << ".\n";
std::cout << "Returning early, with approximation rank = " << ell << "\n\n";
k = ell;
cb(k);
return state;
}
//
Expand Down Expand Up @@ -174,6 +175,7 @@ STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F,
std::cout << "Cholesky failed with exit code " << c_status << ".\n";
std::cout << "Returning early, with approximation rank = " << ell << "\n\n";
k = ell;
cb(k);
return state;
}
blas::trsm(
Expand All @@ -188,6 +190,14 @@ STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F,
w_status = _rpchol_impl::downdate_d_and_cdf(layout, n, Sprime, F_panel, d, cdf);
ell = ell + ell_incr;
}
cb(k);
return state;
}

/**
 * Convenience overload of rp_cholesky for callers that do not need a callback.
 *
 * Every argument is forwarded unchanged to the callback-accepting overload,
 * together with a callback that simply returns its argument and has no side
 * effects.
 *
 * @param n, A_stateless, k, S, F, b, state
 *     Forwarded as-is; see the callback-accepting overload for their meaning.
 *     Note that k is taken by reference and may be overwritten by that overload.
 *
 * @return The STATE returned by the callback-accepting overload. The `state`
 *     parameter here is a by-value copy that the inner call cannot modify, so
 *     returning it (as this function used to) would discard the inner result.
 */
template <typename T, typename FUNC_T, typename STATE>
STATE rp_cholesky(int64_t n, FUNC_T &A_stateless, int64_t &k, int64_t* S, T* F, int64_t b, STATE state) {
    // A named lvalue is required: the callback parameter is a non-const reference.
    auto cb = [](int64_t i) { return i; };
    return rp_cholesky(n, A_stateless, k, S, F, b, state, cb);
}

Expand Down
151 changes: 151 additions & 0 deletions benchmark/bench_kernelalgs/kernelbench_common.hh
Original file line number Diff line number Diff line change
Expand Up @@ -98,3 +98,154 @@ KRR_data mmread_krr_data_dir(std::string dn) {
standardize(data);
return data;
}

/*
 * Process memory-usage helpers (namespace memprof).
 *
 * getPeakRSS() and getCurrentRSS() are based on code by:
 *
 * Author:  David Robert Nadeau
 * Site:    http://NadeauSoftware.com/
 * License: Creative Commons Attribution 3.0 Unported License
 *          http://creativecommons.org/licenses/by/3.0/deed.en_US
 *
 * All headers are included at global scope, before the namespace is opened.
 * Including a system header inside a namespace would place its declarations
 * in that namespace the first time the header is seen.
 */

#include <cassert>
#include <cstddef>
#include <ostream>

#if defined(_WIN32)
/* <windows.h> has to come before <psapi.h>; keep them in separate groups so
 * an include sorter does not swap them. */
#include <windows.h>

#include <psapi.h>

#elif defined(__unix__) || defined(__unix) || defined(unix) || \
    (defined(__APPLE__) && defined(__MACH__))
#include <sys/resource.h>
#include <unistd.h>

#if defined(__APPLE__) && defined(__MACH__)
#include <mach/mach.h>

#elif (defined(_AIX) || defined(__TOS__AIX__)) || \
    (defined(__sun__) || defined(__sun) || \
     defined(sun) && (defined(__SVR4) || defined(__svr4__)))
#include <fcntl.h>
#include <procfs.h>

#elif defined(__linux__) || defined(__linux) || defined(linux) || \
    defined(__gnu_linux__)
#include <stdio.h>

#endif

#else
#error "Cannot define getPeakRSS( ) or getCurrentRSS( ) for an unknown OS."
#endif

namespace memprof {

/**
 * Returns the memory page size in bytes, or zero if it cannot be determined.
 */
inline size_t getPageSize() {
#if defined(_WIN32)
    SYSTEM_INFO system_info;
    GetSystemInfo(&system_info);
    return static_cast<size_t>(system_info.dwPageSize);
#else
    /* sysconf reports failure with a negative value; map that to "unknown". */
    const long raw_page_size = sysconf(_SC_PAGESIZE);
    return (raw_page_size > 0) ? static_cast<size_t>(raw_page_size)
                               : static_cast<size_t>(0);
#endif
}

/**
 * Returns the peak (maximum so far) resident set size (physical
 * memory use) measured in bytes, or zero if the value cannot be
 * determined on this OS or the underlying query fails.
 */
inline size_t getPeakRSS() {
#if defined(_WIN32)
    /* Windows -------------------------------------------------- */
    PROCESS_MEMORY_COUNTERS counters;
    if (!GetProcessMemoryInfo(GetCurrentProcess(), &counters, sizeof(counters)))
        return 0; /* Can't query? Don't read the unfilled struct. */
    return static_cast<size_t>(counters.PeakWorkingSetSize);

#elif (defined(_AIX) || defined(__TOS__AIX__)) || \
    (defined(__sun__) || defined(__sun) || \
     defined(sun) && (defined(__SVR4) || defined(__svr4__)))
    /* AIX and Solaris ------------------------------------------ */
    struct psinfo psinfo;
    const int fd = open("/proc/self/psinfo", O_RDONLY);
    if (fd == -1)
        return 0; /* Can't open? */
    const bool read_ok =
        read(fd, &psinfo, sizeof(psinfo)) == static_cast<ssize_t>(sizeof(psinfo));
    close(fd);
    if (!read_ok)
        return 0; /* Can't read? */
    /* pr_rssize is scaled by 1024 to obtain bytes. */
    return static_cast<size_t>(psinfo.pr_rssize) * static_cast<size_t>(1024);

#elif defined(__unix__) || defined(__unix) || defined(unix) || \
    (defined(__APPLE__) && defined(__MACH__))
    /* BSD, Linux, and OSX -------------------------------------- */
    struct rusage usage;
    if (getrusage(RUSAGE_SELF, &usage) != 0)
        return 0; /* Can't query? Don't read the unfilled struct. */
#if defined(__APPLE__) && defined(__MACH__)
    /* On OSX ru_maxrss is used as a byte count. */
    return static_cast<size_t>(usage.ru_maxrss);
#else
    /* Elsewhere ru_maxrss is scaled by 1024 to obtain bytes. Widen before
     * multiplying so the product is not computed in `long`. */
    return static_cast<size_t>(usage.ru_maxrss) * static_cast<size_t>(1024);
#endif

#else
    /* Unknown OS ----------------------------------------------- */
    return 0; /* Unsupported. */
#endif
}

/**
 * Returns the current resident set size (physical memory use) measured
 * in bytes, or zero if the value cannot be determined on this OS or the
 * underlying query fails.
 */
inline size_t getCurrentRSS() {
#if defined(_WIN32)
    /* Windows -------------------------------------------------- */
    PROCESS_MEMORY_COUNTERS counters;
    if (!GetProcessMemoryInfo(GetCurrentProcess(), &counters, sizeof(counters)))
        return 0; /* Can't query? Don't read the unfilled struct. */
    return static_cast<size_t>(counters.WorkingSetSize);

#elif defined(__APPLE__) && defined(__MACH__)
    /* OSX ------------------------------------------------------ */
    struct mach_task_basic_info task_basic_info;
    mach_msg_type_number_t info_count = MACH_TASK_BASIC_INFO_COUNT;
    const kern_return_t status =
        task_info(mach_task_self(), MACH_TASK_BASIC_INFO,
                  reinterpret_cast<task_info_t>(&task_basic_info), &info_count);
    if (status != KERN_SUCCESS)
        return 0; /* Can't access? */
    return static_cast<size_t>(task_basic_info.resident_size);

#elif defined(__linux__) || defined(__linux) || defined(linux) || \
    defined(__gnu_linux__)
    /* Linux ---------------------------------------------------- */
    FILE *statm = fopen("/proc/self/statm", "r");
    if (statm == nullptr)
        return 0; /* Can't open? */
    /* Skip the first field of statm and read the second one as a page count. */
    long resident_pages = 0L;
    const int fields_read = fscanf(statm, "%*s%ld", &resident_pages);
    fclose(statm);
    if (fields_read != 1 || resident_pages < 0)
        return 0; /* Can't read? */
    return static_cast<size_t>(resident_pages) * getPageSize();

#else
    /* AIX, BSD, Solaris, and Unknown OS ------------------------ */
    return 0; /* Unsupported. */
#endif
}

/**
 * Writes one line to `stream` with the page size, the current resident set
 * size in bytes, and that size expressed in pages, then flushes the stream.
 *
 * In debug builds, asserts that the resident set size is a whole number of
 * pages. If the page size is unknown (zero), the page count is reported as 0.
 */
inline void log_pages(std::ostream &stream) {
    /* The page size is queried once and reused by later calls. */
    static const size_t pagesize = getPageSize();
    const size_t bytes = getCurrentRSS();
    assert(pagesize == 0 || (bytes % pagesize) == 0);
    const size_t pages = (pagesize == 0) ? 0 : bytes / pagesize;
    stream << "page size: " << pagesize << "\t";
    stream << "bytes: " << bytes << "\t";
    stream << "pages: " << pages << std::endl;
}

/**
 * Writes one line to `stream` with the current resident set size in GB,
 * where one GB is taken to be 1024^3 bytes. Does not flush the stream.
 */
inline void log_memory_gb(std::ostream &stream) {
    constexpr double bytes_per_gb = 1024.0 * 1024.0 * 1024.0;
    const double gb = static_cast<double>(getCurrentRSS()) / bytes_per_gb;
    stream << " Memory (GB) : " << gb << "\n";
}

}
23 changes: 16 additions & 7 deletions benchmark/bench_kernelalgs/kpca.cc
Original file line number Diff line number Diff line change
Expand Up @@ -52,17 +52,19 @@ enum TSSVD : char {
RandPrecondCholSVD = 'R'
};

template <typename T>
std::pair<timepoint_t,timepoint_t> convert_svd(int64_t m, int64_t rank, vector<T> &U, vector<T> &kevals, TSSVD cs = TSSVD::GESDD) {
template <typename T, typename CALLBACK>
std::pair<timepoint_t,timepoint_t> convert_svd(int64_t m, int64_t rank, vector<T> &U, vector<T> &kevals, TSSVD cs, CALLBACK &cb) {
auto _tp0 = std_clock::now();
if (cs == TSSVD::GESDD) {
vector<T> work(rank*rank, 0.0);
gesdd(Job::OverwriteVec, m, rank, U.data(), m, kevals.data(), nullptr, 1, work.data(), rank);
for (int64_t i = 0; i < rank; ++i)
kevals[i] = std::pow(kevals[i], 2);
cb(0);
} else if (cs == TSSVD::CholSVD) {
vector<T> work((rank + m)*rank, 0.0);
cholsvd_square(m, rank, U.data(), m, kevals.data(), work.data());
cb(0);
}
auto _tp1 = std_clock::now();
return {_tp0, _tp1};
Expand Down Expand Up @@ -90,27 +92,34 @@ int main() {
RNGState state(0);
vector<int64_t> selection(rank, -1);

std::stringstream strm{};
auto callback = [&strm](int64_t i) { memprof::log_memory_gb(strm); return i;};

std::cout << "RPCholesky (RPC)\n";
std::cout << " block size : " << rpchol_block_size << std::endl;
std::cout << " rank limit : " << rank << std::endl;
auto _tp0 = std_clock::now();
state = rp_cholesky(m, K_reg, rank, selection.data(), U.data(), rpchol_block_size, state);
state = rp_cholesky(m, K_reg, rank, selection.data(), U.data(), rpchol_block_size, state, callback);
auto _tp1 = std_clock::now();
std::cout << " exit rank : " << rank << std::endl;
std::cout << " RPC time (s) : " << DOUT(sec_elapsed(_tp0, _tp1)) << std::endl;
std::cout << strm.str();

strm.str("");
strm.clear();

// Variables for SVD conversion
// We don't allocate these earlier, since "rank" might have decreased
// in the call to rp_cholesky.
vector<T> kevals(rank, 0.0);

{
auto [tp0, tp1] = convert_svd(m, rank, U, kevals, TSSVD::CholSVD);
std::cout << " SVD time (s) : " << DOUT(sec_elapsed(tp0, tp1)) << "\n\n";
auto [tp0, tp1] = convert_svd(m, rank, U, kevals, TSSVD::CholSVD, callback);
std::cout << " SVD time (s) : " << DOUT(sec_elapsed(tp0, tp1)) << "\n";
std::cout << strm.str() << "\n";
}
// Now check || K_reg @ U[:, 0:num_pc] - U[:,0:num_pc] @ diag(eigvals[0:num_pc]) ||,
// or || K_reg @ U[:, 0:num_pc] @ inv(diag(eigvals[0:num_pc])) - U[:,0:num_pc]||
int64_t num_pc = 2;
int64_t num_pc = 5;
vector<T> V(m*num_pc, 0.0);
T onef = 1.0;
K_reg(blas::Layout::ColMajor, num_pc, onef, U.data(), m, (T)0.0, V.data(), m);
Expand Down
35 changes: 35 additions & 0 deletions benchmark/bench_kernelalgs/logging.txt
Original file line number Diff line number Diff line change
@@ -1,5 +1,40 @@

KPCA
====
Our implementation
Dataset
/Users/rjmurr/Documents/open-data/kernel-ridge-regression/cod-rna
cols : 59535
rows : 8

RPCholesky (RPC)
block size : 64
rank limit : 243
exit rank : 243
RPC time (s) : 0.058426
SVD time (s) : 0.056271

Error in KPCA components
component 0 : 3.165351e-10
component 1 : 2.4040413e-08

Python implementations

Dataset dimensions (RandLAPACK's convention)
n_rows : 8
n_cols : 59535
44.33819890022278 seconds for sklearn's KPCA.


Dataset dimensions (RandLAPACK's convention)
n_rows : 8
n_cols : 59535
0.3426549434661865 seconds for Ethan's RPCholesky, with block size 64.



KRR
===
CONCLUSIONS

1. Performance is best with -O1 (13 seconds vs 17 seconds from -O0, but same result as -O0).
Expand Down

0 comments on commit 834681a

Please sign in to comment.