diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2871ac0..ef224b15 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,9 @@ else ()
     endif ()
 endif ()
 
+# For the chrones library (profiler)
+include_directories(/usr/users/gpusvm/plowiecki_sta/.local/lib/python3.8/site-packages/Chrones/instrumentation/cpp)
+
 if (CMAKE_VERSION VERSION_LESS "3.1")
     add_compile_options("-std=c++11")
 else ()
diff --git a/include/thundersvm/syncmem.h b/include/thundersvm/syncmem.h
index c86e1484..b9aefbb0 100644
--- a/include/thundersvm/syncmem.h
+++ b/include/thundersvm/syncmem.h
@@ -6,6 +6,7 @@
 #define THUNDERSVM_SYNCMEM_H
 
 #include <thundersvm/thundersvm.h>
+#include <chrones.hpp>
 
 namespace thunder {
     inline void malloc_host(void **ptr, size_t size) {
@@ -25,6 +26,7 @@ namespace thunder {
     }
 
     inline void device_mem_copy(void *dst, const void *src, size_t size) {
+        CHRONE();
 #ifdef USE_CUDA
         CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
 #else
@@ -68,9 +70,11 @@ namespace thunder {
 
         ///transfer data to host
         void to_host();
+        void to_host_impl();
 
         ///transfer data to device
         void to_device();
+        void to_device_impl();
 
         ///return the size of memory
         size_t size() const;
diff --git a/include/thundersvm/thundersvm.h b/include/thundersvm/thundersvm.h
index a9314e9a..07af53cc 100644
--- a/include/thundersvm/thundersvm.h
+++ b/include/thundersvm/thundersvm.h
@@ -13,7 +13,7 @@
 #include "util/common.h"
 using std::string;
 using std::vector;
-typedef double float_type;
+typedef float float_type;
 
 #ifdef USE_DOUBLE
 typedef double kernel_type;
diff --git a/src/thundersvm/kernel/kernelmatrix_kernel.cu b/src/thundersvm/kernel/kernelmatrix_kernel.cu
index 2bd947d3..9fc7392a 100644
--- a/src/thundersvm/kernel/kernelmatrix_kernel.cu
+++ b/src/thundersvm/kernel/kernelmatrix_kernel.cu
@@ -200,6 +200,7 @@ namespace svm_kernel {
     RBF_kernel(const SyncArray<int> &self_dot0_idx, const SyncArray<kernel_type> &self_dot1,
                SyncArray<kernel_type> &dot_product, int m,
                int n, kernel_type gamma) {
+        CHRONE();
         SAFE_KERNEL_LAUNCH(kernel_RBF_kernel, self_dot0_idx.device_data(), self_dot1.device_data(),
                            dot_product.device_data(), m, n, gamma);
     }
diff --git a/src/thundersvm/kernel/smo_kernel.cu b/src/thundersvm/kernel/smo_kernel.cu
index 6d9db31c..a2d58006 100644
--- a/src/thundersvm/kernel/smo_kernel.cu
+++ b/src/thundersvm/kernel/smo_kernel.cu
@@ -257,6 +257,7 @@ namespace svm_kernel {
                      const SyncArray<int> &working_set, float_type Cp, float_type Cn,
                      const SyncArray<kernel_type> &k_mat_rows, const SyncArray<kernel_type> &k_mat_diag,
                      int row_len, float_type eps, SyncArray<float_type> &diff, int max_iter) {
+        CHRONE();
         size_t ws_size = working_set.size();
         size_t smem_size = 0;
         smem_size += ws_size * sizeof(int); //f_idx2reduce
@@ -291,9 +292,9 @@ namespace svm_kernel {
                    int n_instances) {
         //"n_instances" equals to the number of rows of the whole kernel matrix for both SVC and SVR.
         KERNEL_LOOP(idx, n_instances) {//one thread to update multiple fvalues.
-            double sum_diff = 0;
+            float_type sum_diff = 0;
             for (int i = 0; i < ws_size; ++i) {
-                double d = alpha_diff[i];
+                float_type d = alpha_diff[i];
                 if (d != 0) {
                     sum_diff += d * k_mat_rows[i * n_instances + idx];
                 }
@@ -305,6 +306,7 @@ namespace svm_kernel {
     void update_f(SyncArray<float_type> &f, const SyncArray<float_type> &alpha_diff,
                   const SyncArray<kernel_type> &k_mat_rows,
                   int n_instances) {
+        CHRONE();
         SAFE_KERNEL_LAUNCH(update_f_kernel, f.device_data(), alpha_diff.size(), alpha_diff.device_data(),
                            k_mat_rows.device_data(), n_instances);
     }
diff --git a/src/thundersvm/kernelmatrix.cpp b/src/thundersvm/kernelmatrix.cpp
index 27dbe4be..21906dcf 100644
--- a/src/thundersvm/kernelmatrix.cpp
+++ b/src/thundersvm/kernelmatrix.cpp
@@ -181,6 +181,7 @@ void CSR_DenseCSR(size_t m,size_t n,vector<kernel_type> &csr_val,vector<int> &cs
 }
 
 KernelMatrix::KernelMatrix(const DataSet::node2d &instances, SvmParam param) {
+    CHRONE();
     n_instances_ = instances.size();
     n_features_ = 0;
     this->param = param;
@@ -250,6 +251,7 @@ KernelMatrix::KernelMatrix(const DataSet::node2d &instances, SvmParam param) {
 
 void KernelMatrix::get_rows(const SyncArray<int> &idx,
                             SyncArray<kernel_type> &kernel_rows) const {//compute multiple rows of kernel matrix according to idx
+    CHRONE();
     CHECK_GE(kernel_rows.size(), idx.size() * n_instances_) << "kernel_rows memory is too small";
 #ifdef USE_CUDA
     get_dot_product_dns_csr_dns_dns(idx, sparse_mat_,dense_mat_,kernel_rows);
@@ -316,7 +318,7 @@ const SyncArray<kernel_type> &KernelMatrix::diag() const {
 }
 
 void KernelMatrix::get_dot_product_dns_csr_dns_dns(const SyncArray<int> &idx,const SparseData &sparse,
                                                    const DenseData &dense,SyncArray<kernel_type> &dot_product) const{
-
+    CHRONE();
 
     //get sparse part result matrix and dense part result matrix
diff --git a/src/thundersvm/model/nusvc.cpp b/src/thundersvm/model/nusvc.cpp
index aefdd8e8..e1205c2d 100644
--- a/src/thundersvm/model/nusvc.cpp
+++ b/src/thundersvm/model/nusvc.cpp
@@ -19,12 +19,12 @@ void NuSVC::train_binary(const DataSet &dataset, int i, int j, SyncArray<float_type> &alpha_2c, float_type &rho) {
<<<<<<< NOTE(review): the body of this hunk (12 lines old / 12 lines new) was destroyed during
<<<<<<< text extraction (greedy angle-bracket stripping ate everything between the first '<' and
<<<<<<< the last '>' of the collapsed hunk). It must be restored from commit e1205c2d before this
<<<<<<< patch can apply; by analogy with the smo_kernel.cu hunk it most likely replaced `double`
<<<<<<< locals with `float_type`. Only this trailing context fragment survived:
 ori = dataset.original_index(i, j);
diff --git a/src/thundersvm/model/svc.cpp b/src/thundersvm/model/svc.cpp
index eca7bf82..203d30d9 100644
--- a/src/thundersvm/model/svc.cpp
+++ b/src/thundersvm/model/svc.cpp
@@ -39,6 +39,7 @@ void SVC::model_setup(const DataSet &dataset, SvmParam &param) {
 }
 
 void SVC::train(const DataSet &dataset, SvmParam param) {
+    CHRONE();
     DataSet dataset_ = dataset;
     dataset_.group_classes();
     model_setup(dataset_, param);
@@ -49,6 +50,7 @@ void SVC::train(const DataSet &dataset, SvmParam param) {
     int k = 0;
     for (int i = 0; i < n_classes; ++i) {
         for (int j = i + 1; j < n_classes; ++j) {
+            CHRONE("binary", k);
             train_binary(dataset_, i, j, alpha[k], rho.host_data()[k]);
             vector<int> original_index = dataset_.original_index(i, j);
             CHECK_EQ(original_index.size(), alpha[k].size());
diff --git a/src/thundersvm/solver/csmosolver.cpp b/src/thundersvm/solver/csmosolver.cpp
index 37a04f73..f676902f 100644
--- a/src/thundersvm/solver/csmosolver.cpp
+++ b/src/thundersvm/solver/csmosolver.cpp
@@ -5,12 +5,15 @@
 #include <thundersvm/solver/csmosolver.h>
 #include <thundersvm/kernel/smo_kernel.h>
 
+CHRONABLE("smosolver");
+
 using namespace svm_kernel;
 
 void
 CSMOSolver::solve(const KernelMatrix &k_mat, const SyncArray<int> &y, SyncArray<float_type> &alpha,
                   float_type &rho, SyncArray<float_type> &f_val, float_type eps, float_type Cp,
                   float_type Cn, int ws_size,
                   int out_max_iter) const {
+    CHRONE();
     int n_instances = k_mat.n_instances();
     int q = ws_size / 2;
@@ -57,6 +60,7 @@ CSMOSolver::solve(const KernelMatrix &k_mat, const SyncArray<int> &y, SyncArray<float_type> &alpha,
     float_type second_last_local_diff = INFINITY;
 
     for (int iter = 0;; ++iter) {
+        CHRONE("iteration", iter);
         //select working set
         f_idx2sort.copy_from(f_idx);
         f_val2sort.copy_from(f_val);
@@ -124,6 +128,7 @@ void CSMOSolver::select_working_set(vector<int> &ws_indicator, const SyncArray<int> &f_idx2sort,
                                     const SyncArray<int> &y, const SyncArray<float_type> &alpha,
                                     float_type Cp, float_type Cn,
                                     SyncArray<int> &working_set) const {
+    CHRONE();
     int n_instances = ws_indicator.size();
     int p_left = 0;
     int p_right = n_instances - 1;
diff --git a/src/thundersvm/syncarray.cpp b/src/thundersvm/syncarray.cpp
index 3970b2aa..898b69c6 100644
--- a/src/thundersvm/syncarray.cpp
+++ b/src/thundersvm/syncarray.cpp
@@ -48,6 +48,7 @@ void SyncArray<T>::resize(size_t count) {
 template<typename T>
 void SyncArray<T>::copy_from(const T *source, size_t count) {
 #ifdef USE_CUDA
+    cudaDeviceSynchronize();
     thunder::device_mem_copy(mem->device_data(), source, sizeof(T) * count);
 #else
     memcpy(mem->host_data(), source, sizeof(T) * count);
diff --git a/src/thundersvm/syncmem.cpp b/src/thundersvm/syncmem.cpp
index 9d14fcfe..ae520c20 100644
--- a/src/thundersvm/syncmem.cpp
+++ b/src/thundersvm/syncmem.cpp
@@ -56,6 +56,14 @@ namespace thunder {
     }
 
     void SyncMem::to_host() {
+#ifdef USE_CUDA
+        cudaDeviceSynchronize();
+#endif
+        to_host_impl();
+    }
+
+    void SyncMem::to_host_impl() {
+        CHRONE();
         switch (head_) {
             case UNINITIALIZED:
                 malloc_host(&host_ptr, size_);
@@ -82,6 +90,14 @@ namespace thunder {
     }
 
     void SyncMem::to_device() {
+#ifdef USE_CUDA
+        cudaDeviceSynchronize();
+#endif
+        to_device_impl();
+    }
+
+    void SyncMem::to_device_impl() {
+        CHRONE();
 #ifdef USE_CUDA
         switch (head_) {
             case UNINITIALIZED: