diff --git a/CMakeLists.txt b/CMakeLists.txt
index d2871ac0..ef224b15 100755
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -56,6 +56,9 @@ else ()
     endif ()
 endif ()
 
+# For the chrones library (profiler)
+include_directories(/usr/users/gpusvm/plowiecki_sta/.local/lib/python3.8/site-packages/Chrones/instrumentation/cpp)
+
 if (CMAKE_VERSION VERSION_LESS "3.1")
     add_compile_options("-std=c++11")
 else ()
diff --git a/include/thundersvm/syncmem.h b/include/thundersvm/syncmem.h
index c86e1484..b9aefbb0 100644
--- a/include/thundersvm/syncmem.h
+++ b/include/thundersvm/syncmem.h
@@ -6,6 +6,7 @@
 #define THUNDERSVM_SYNCMEM_H
 
 #include <thundersvm/thundersvm.h>
+#include <chrones.hpp>
 
 namespace thunder {
     inline void malloc_host(void **ptr, size_t size) {
@@ -25,6 +26,7 @@ namespace thunder {
     }
 
     inline void device_mem_copy(void *dst, const void *src, size_t size) {
+        CHRONE();
 #ifdef USE_CUDA
         CUDA_CHECK(cudaMemcpy(dst, src, size, cudaMemcpyDefault));
 #else
@@ -68,9 +70,11 @@ namespace thunder {
 
         ///transfer data to host
         void to_host();
+        void to_host_impl();
 
         ///transfer data to device
         void to_device();
+        void to_device_impl();
 
         ///return the size of memory
         size_t size() const;
diff --git a/include/thundersvm/thundersvm.h b/include/thundersvm/thundersvm.h
index a9314e9a..07af53cc 100644
--- a/include/thundersvm/thundersvm.h
+++ b/include/thundersvm/thundersvm.h
@@ -13,7 +13,7 @@
 #include "util/common.h"
 using std::string;
 using std::vector;
-typedef double float_type;
+typedef float float_type;
 
 #ifdef USE_DOUBLE
 typedef double kernel_type;
diff --git a/src/thundersvm/kernel/kernelmatrix_kernel.cu b/src/thundersvm/kernel/kernelmatrix_kernel.cu
index 2bd947d3..9fc7392a 100644
--- a/src/thundersvm/kernel/kernelmatrix_kernel.cu
+++ b/src/thundersvm/kernel/kernelmatrix_kernel.cu
@@ -200,6 +200,7 @@ namespace svm_kernel {
     RBF_kernel(const SyncArray<int> &self_dot0_idx, const SyncArray<kernel_type> &self_dot1,
                SyncArray<kernel_type> &dot_product, int m,
                int n, kernel_type gamma) {
+        CHRONE();
         SAFE_KERNEL_LAUNCH(kernel_RBF_kernel, self_dot0_idx.device_data(), self_dot1.device_data(),
                            dot_product.device_data(), m, n, gamma);
     }
diff --git a/src/thundersvm/kernel/smo_kernel.cu b/src/thundersvm/kernel/smo_kernel.cu
index 6d9db31c..a2d58006 100644
--- a/src/thundersvm/kernel/smo_kernel.cu
+++ b/src/thundersvm/kernel/smo_kernel.cu
@@ -257,6 +257,7 @@ namespace svm_kernel {
                      const SyncArray<int> &working_set, float_type Cp, float_type Cn,
                      const SyncArray<kernel_type> &k_mat_rows, const SyncArray<kernel_type> &k_mat_diag,
                      int row_len, float_type eps, SyncArray<float_type> &diff, int max_iter) {
+        CHRONE();
         size_t ws_size = working_set.size();
         size_t smem_size = 0;
         smem_size += ws_size * sizeof(int); //f_idx2reduce
@@ -291,9 +292,9 @@ namespace svm_kernel {
                    int n_instances) {
         //"n_instances" equals to the number of rows of the whole kernel matrix for both SVC and SVR.
         KERNEL_LOOP(idx, n_instances) {//one thread to update multiple fvalues.
-            double sum_diff = 0;
+            float_type sum_diff = 0;
             for (int i = 0; i < ws_size; ++i) {
-                double d = alpha_diff[i];
+                float_type d = alpha_diff[i];
                 if (d != 0) {
                     sum_diff += d * k_mat_rows[i * n_instances + idx];
                 }
@@ -305,6 +306,7 @@ namespace svm_kernel {
     void update_f(SyncArray<float_type> &f, const SyncArray<float_type> &alpha_diff,
                   const SyncArray<kernel_type> &k_mat_rows,
                   int n_instances) {
+        CHRONE();
         SAFE_KERNEL_LAUNCH(update_f_kernel, f.device_data(), alpha_diff.size(), alpha_diff.device_data(),
                            k_mat_rows.device_data(), n_instances);
     }
diff --git a/src/thundersvm/kernelmatrix.cpp b/src/thundersvm/kernelmatrix.cpp
index 27dbe4be..21906dcf 100644
--- a/src/thundersvm/kernelmatrix.cpp
+++ b/src/thundersvm/kernelmatrix.cpp
@@ -181,6 +181,7 @@ void CSR_DenseCSR(size_t m,size_t n,vector<kernel_type> &csr_val,vector<int> &cs
 }
 
 KernelMatrix::KernelMatrix(const DataSet::node2d &instances, SvmParam param) {
+    CHRONE();
     n_instances_ = instances.size();
     n_features_ = 0;
     this->param = param;
@@ -250,6 +251,7 @@ KernelMatrix::KernelMatrix(const DataSet::node2d &instances, SvmParam param) {
 
 void KernelMatrix::get_rows(const SyncArray<int> &idx,
                             SyncArray<kernel_type> &kernel_rows) const {//compute multiple rows of kernel matrix according to idx
+    CHRONE();
     CHECK_GE(kernel_rows.size(), idx.size() * n_instances_) << "kernel_rows memory is too small";
 #ifdef USE_CUDA
     get_dot_product_dns_csr_dns_dns(idx, sparse_mat_,dense_mat_,kernel_rows);
@@ -316,7 +318,7 @@ const SyncArray<kernel_type> &KernelMatrix::diag() const {
 }
 
 void KernelMatrix::get_dot_product_dns_csr_dns_dns(const SyncArray<int> &idx,const SparseData &sparse,
                                                    const DenseData &dense,SyncArray<kernel_type> &dot_product) const{
-
+    CHRONE();
 
     //get sparse part result matrix and dense part result matrix
diff --git a/src/thundersvm/model/nusvc.cpp b/src/thundersvm/model/nusvc.cpp
index aefdd8e8..e1205c2d 100644
--- a/src/thundersvm/model/nusvc.cpp
+++ b/src/thundersvm/model/nusvc.cpp
@@ -19,12 +19,12 @@ void NuSVC::train_binary(const DataSet &dataset, int i, int j, SyncArray<float_type> &alpha_2c, float_type &rho) {
<<<<<<< NOTE(review): the body of this hunk (12 lines old / 12 lines new) was destroyed during
<<<<<<< text extraction (greedy angle-bracket stripping ate everything between the first '<' and
<<<<<<< the last '>' of the collapsed hunk). It must be restored from commit e1205c2d before this
<<<<<<< patch can apply; by analogy with the smo_kernel.cu hunk it most likely replaced `double`
<<<<<<< locals with `float_type`. Only this trailing context fragment survived:
 ori = dataset.original_index(i, j);
diff --git a/src/thundersvm/model/svc.cpp b/src/thundersvm/model/svc.cpp
index eca7bf82..203d30d9 100644
--- a/src/thundersvm/model/svc.cpp
+++ b/src/thundersvm/model/svc.cpp
@@ -39,6 +39,7 @@ void SVC::model_setup(const DataSet &dataset, SvmParam &param) {
 }
 
 void SVC::train(const DataSet &dataset, SvmParam param) {
+    CHRONE();
     DataSet dataset_ = dataset;
     dataset_.group_classes();
     model_setup(dataset_, param);
@@ -49,6 +50,7 @@ void SVC::train(const DataSet &dataset, SvmParam param) {
     int k = 0;
     for (int i = 0; i < n_classes; ++i) {
         for (int j = i + 1; j < n_classes; ++j) {
+            CHRONE("binary", k);
             train_binary(dataset_, i, j, alpha[k], rho.host_data()[k]);
             vector<int> original_index = dataset_.original_index(i, j);
             CHECK_EQ(original_index.size(), alpha[k].size());
diff --git a/src/thundersvm/solver/csmosolver.cpp b/src/thundersvm/solver/csmosolver.cpp
index 37a04f73..f676902f 100644
--- a/src/thundersvm/solver/csmosolver.cpp
+++ b/src/thundersvm/solver/csmosolver.cpp
@@ -5,12 +5,15 @@
 #include <thundersvm/solver/csmosolver.h>
 #include <thundersvm/kernel/smo_kernel.h>
 
+CHRONABLE("smosolver");
+
 using namespace svm_kernel;
 
 void
 CSMOSolver::solve(const KernelMatrix &k_mat, const SyncArray<int> &y, SyncArray<float_type> &alpha,
                   float_type &rho, SyncArray<float_type> &f_val, float_type eps, float_type Cp,
                   float_type Cn, int ws_size,
                   int out_max_iter) const {
+    CHRONE();
     int n_instances = k_mat.n_instances();
     int q = ws_size / 2;
@@ -57,6 +60,7 @@ CSMOSolver::solve(const KernelMatrix &k_mat, const SyncArray<int> &y, SyncArray<float_type> &alpha,
     float_type second_last_local_diff = INFINITY;
 
     for (int iter = 0;; ++iter) {
+        CHRONE("iteration", iter);
         //select working set
         f_idx2sort.copy_from(f_idx);
         f_val2sort.copy_from(f_val);
@@ -124,6 +128,7 @@ void CSMOSolver::select_working_set(vector<int> &ws_indicator, const SyncArray<int> &f_idx2sort,
                                     const SyncArray<int> &y, const SyncArray<float_type> &alpha,
                                     float_type Cp, float_type Cn,
                                     SyncArray<int> &working_set) const {
+    CHRONE();
     int n_instances = ws_indicator.size();
     int p_left = 0;
     int p_right = n_instances - 1;
diff --git a/src/thundersvm/syncarray.cpp b/src/thundersvm/syncarray.cpp
index 3970b2aa..898b69c6 100644
--- a/src/thundersvm/syncarray.cpp
+++ b/src/thundersvm/syncarray.cpp
@@ -48,6 +48,7 @@ void SyncArray<T>::resize(size_t count) {
 template<typename T>
 void SyncArray<T>::copy_from(const T *source, size_t count) {
 #ifdef USE_CUDA
+    cudaDeviceSynchronize();
     thunder::device_mem_copy(mem->device_data(), source, sizeof(T) * count);
 #else
     memcpy(mem->host_data(), source, sizeof(T) * count);
diff --git a/src/thundersvm/syncmem.cpp b/src/thundersvm/syncmem.cpp
index 9d14fcfe..ae520c20 100644
--- a/src/thundersvm/syncmem.cpp
+++ b/src/thundersvm/syncmem.cpp
@@ -56,6 +56,14 @@ namespace thunder {
     }
 
     void SyncMem::to_host() {
+#ifdef USE_CUDA
+        cudaDeviceSynchronize();
+#endif
+        to_host_impl();
+    }
+
+    void SyncMem::to_host_impl() {
+        CHRONE();
         switch (head_) {
             case UNINITIALIZED:
                 malloc_host(&host_ptr, size_);
@@ -82,6 +90,14 @@ namespace thunder {
     }
 
     void SyncMem::to_device() {
+#ifdef USE_CUDA
+        cudaDeviceSynchronize();
+#endif
+        to_device_impl();
+    }
+
+    void SyncMem::to_device_impl() {
+        CHRONE();
 #ifdef USE_CUDA
         switch (head_) {
             case UNINITIALIZED: