Implemented get_[data|view]_by_spectral() + fixed a bug in SpectralClusteringVisitor
hosseinmoein · Jan 6, 2025 · 76e93d1 · 76e93d1
1 parent 0d937aa
commit 76e93d1
Show file tree

Hide file tree

Showing 8 changed files with 533 additions and 63 deletions.
diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html
@@ -429,6 +429,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
       <td title="Gets data or view by selection"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/get_data_by_sel.html">get_data_by_sel( 5 )<BR>get_view_by_sel( 5 )</a></td>
     </tr>
 
+    <tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
+      <td title="Gets data or views by applying spectral clustering algorithm"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/get_data_by_spectral.html">get_data_by_spectral()<BR>get_view_by_spectral()</a></td>
+    </tr>
+
     <tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
       <td title="Gets data or view by standard deviation"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/get_data_by_stdev.html">get_data_by_stdev()<BR>get_view_by_stdev()</a></td>
     </tr>

diff --git a/docs/HTML/SpectralClusteringVisitor.html b/docs/HTML/SpectralClusteringVisitor.html
diff --git a/docs/HTML/get_data_by_spectral.html b/docs/HTML/get_data_by_spectral.html
diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h
@@ -3331,6 +3331,81 @@ class   DataFrame : public ThreadGranularity {
                        size_type num_of_iter = 1000,
                        seed_t seed = seed_t(-1)) const;
 
+    // This uses spectral clustering algorithm to divide the named column into
+    // K clusters. It returns an array of K DataFrame's each containing one of
+    // the clusters of data based on the named column.
+    // Self is unchanged.
+    //
+    // NOTE: Type T must support arithmetic operations
+    //
+    // K:
+    //   Number of clusters for k-means clustering algorithm
+    // T:
+    //   Type of the named column
+    // Ts:
+    //   List all the types of all data columns. A type should be specified in
+    //   the list only once.
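+    // col_name:
+    //   Name of the given column
+    // sigma:
+    //   Scale parameter passed as the third argument to sfunc. In the default
+    //   sfunc it is the width of the Gaussian kernel
+    //   exp(-(x - y)^2 / (2 * sigma^2)).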
+    // sfunc:
+    //   A function to calculate the similarity matrix between data points in
+    //   the named column
+    // num_of_iter:
+    //   Maximum number of iterations for k-means clustering algorithm before
+    //   converging
+    // seed:
+    //   Seed for random number generator to initialize k-means clustering
+    //   algorithm. Default is a random number for each call.
+    //
+    // NOTE: sfunc is declared as an rvalue reference, so it binds to a
+    // temporary (e.g. a lambda converted to std::function) or to a
+    // std::move()'d std::function, but not to a named std::function object.
+    //
+    template<std::size_t K, arithmetic T, typename ... Ts>
+    [[nodiscard]]
+    std::array<DataFrame<I, HeteroVector<std::size_t(H::align_value)>>, K>
+    get_data_by_spectral(const char *col_name,
+                         double sigma,
+                         seed_t seed = seed_t(-1),
+                         std::function<double(const T &x, const T &y,
+                                              double sigma)>  &&sfunc =
+                             // Default similarity: Gaussian kernel,
+                             // exp(-(x - y)^2 / (2 * sigma^2))
+                             [](const T &x, const T &y,
+                                double sigma) -> double  {
+                                 return (std::exp(-((x - y) * (x - y)) /
+                                                  (2 * sigma * sigma)));
+                             },
+                         size_type num_of_iter = 1000) const;
+
+    // Same as above but it returns an array of Views.
+    //
+    // NOTE: sfunc is declared as an rvalue reference, so it binds to a
+    // temporary (e.g. a lambda converted to std::function) or to a
+    // std::move()'d std::function, but not to a named std::function object.
+    //
+    template<std::size_t K, arithmetic T, typename ... Ts>
+    [[nodiscard]]
+    std::array<PtrView, K>
+    get_view_by_spectral(const char *col_name,
+                         double sigma,
+                         seed_t seed = seed_t(-1),
+                         std::function<double(const T &x, const T &y,
+                                              double sigma)>  &&sfunc =
+                             // Default similarity: Gaussian kernel,
+                             // exp(-(x - y)^2 / (2 * sigma^2))
+                             [](const T &x, const T &y,
+                                double sigma) -> double  {
+                                 return (std::exp(-((x - y) * (x - y)) /
+                                                  (2 * sigma * sigma)));
+                             },
+                         size_type num_of_iter = 1000);
+
+    // Same as above but it returns an array of const Views.
+    //
+    // NOTE: sfunc is declared as an rvalue reference, so it binds to a
+    // temporary (e.g. a lambda converted to std::function) or to a
+    // std::move()'d std::function, but not to a named std::function object.
+    //
+    template<std::size_t K, arithmetic T, typename ... Ts>
+    [[nodiscard]]
+    std::array<ConstPtrView, K>
+    get_view_by_spectral(const char *col_name,
+                         double sigma,
+                         seed_t seed = seed_t(-1),
+                         std::function<double(const T &x, const T &y,
+                                              double sigma)>  &&sfunc =
+                             // Default similarity: Gaussian kernel,
+                             // exp(-(x - y)^2 / (2 * sigma^2))
+                             [](const T &x, const T &y,
+                                double sigma) -> double  {
+                                 return (std::exp(-((x - y) * (x - y)) /
+                                                  (2 * sigma * sigma)));
+                             },
+                         size_type num_of_iter = 1000) const;
+
     // This uses Affinity Propagation algorithm to divide the named column
     // into clusters. It returns an array of DataFrame's each containing one
     // of the clusters of data based on the named column. Unlike K-Means

diff --git a/include/DataFrame/DataFrameMLVisitors.h b/include/DataFrame/DataFrameMLVisitors.h
@@ -2799,7 +2799,6 @@ struct  SpectralClusteringVisitor  {
     do_kmeans_(const mat_t &eigenvecs)  {
 
         const long      rows = eigenvecs.rows();  // Samples
-        const long      cols = eigenvecs.cols();  // dimensions
         vec_t<long>     cluster_idxs (rows, -1L);
         constexpr long  k = long(K);
 
@@ -2810,11 +2809,14 @@ struct  SpectralClusteringVisitor  {
 
         // Seed each of the k means from an eigen vector row picked by rd_gen.
         //
-        mat_t   means { k, cols };
+        mat_t   means { k, k };
 
-        for (long r = 0; r < k; ++r)
-            for (long c = 0; c < cols; ++c)
-                means(r, c) = eigenvecs(r, c);
+        for (long r = 0; r < k; ++r)  {
+            const auto rr = rd_gen(gen);
+
+            for (long c = 0; c < k; ++c)
+                means(r, c) = eigenvecs(rr, c);
+        }
 
         for (size_type iter = 0; iter < iter_num_; ++iter)  {
              // Assign cluster_idxs based on closest means
@@ -2824,7 +2826,7 @@ struct  SpectralClusteringVisitor  {
 
                  for (long rr = 0; rr < k; ++rr) {
                      const double   distance =
-                         distance_func_(eigenvecs, means, r, rr, cols);
+                         distance_func_(eigenvecs, means, r, rr, k);
 
                      if (distance < best_distance) {
                          best_distance = distance;
@@ -2835,24 +2837,24 @@ struct  SpectralClusteringVisitor  {
 
              // Update means
              //
-             mat_t          new_means { k, cols };
+             mat_t          new_means { k, k };
              vec_t<long>    counts (k, 0L);
 
              for (long r = 0; r < rows; ++r) {
-                 for (long c = 0; c < cols; ++c)
+                 for (long c = 0; c < k; ++c)
                      new_means(cluster_idxs[r], c) += eigenvecs(r, c);
                  counts[cluster_idxs[r]]++;
              }
 
              for (int r = 0; r < k; ++r) {
                  if (counts[r] > 0)  {
-                     for (long c = 0; c < cols; ++c)
+                     for (long c = 0; c < k; ++c)
                          new_means(r, c) /= T(counts[r]);
                  }
                  else  { // Reinitialize centroid if no points assigned
                      const auto rr = rd_gen(gen);
 
-                     for (long c = 0; c < cols; ++c)
+                     for (long c = 0; c < k; ++c)
                          new_means(r, c) = eigenvecs(rr, c);
                  }
              }
@@ -2920,13 +2922,13 @@ struct  SpectralClusteringVisitor  {
     SpectralClusteringVisitor(
         size_type num_of_iter,
         double sigma,
+        seed_t seed = seed_t(-1),
         similarity_func sf =
             [](const value_type &x,
                const value_type &y,
                double sigma) -> double  {
                 return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
-            },
-        seed_t seed = seed_t(-1))
+            })
         : iter_num_(num_of_iter),
           seed_(seed),
           sigma_(sigma),

diff --git a/include/DataFrame/Internals/DataFrame_slice.tcc b/include/DataFrame/Internals/DataFrame_slice.tcc
@@ -2712,6 +2712,105 @@ get_view_by_kmeans(const char *col_name,
 
 // ----------------------------------------------------------------------------
 
+// Runs the spectral clustering visitor (spect_v) over the column named
+// col_name and returns an array of K DataFrames, one built from each of the
+// K index sets reported by the visitor.
+//
+// col_name:
+//   Name of the column of type T that is handed to the visitor
+// sigma, seed, sfunc, num_of_iter:
+//   Passed on to the spect_v constructor
+//
+// NOTE(review): sfunc is an rvalue reference but is handed to the visitor as
+// an lvalue, so a visitor that takes it by value copies it instead of moving
+// it -- confirm whether std::move(sfunc) was intended.
+//
+template<typename I, typename H>
+template<std::size_t K, arithmetic T, typename ... Ts>
+std::array<DataFrame<I, HeteroVector<std::size_t(H::align_value)>>, K>
+DataFrame<I, H>::
+get_data_by_spectral(const char *col_name,
+                     double sigma,
+                     seed_t seed,
+                     std::function<double(const T &x, const T &y,
+                                          double sigma)>  &&sfunc,
+                     size_type num_of_iter) const  {
+
+    using df_t = DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;
+    using res_t = std::array<df_t, K>;
+    using scv_t = spect_v<K, T, I, std::size_t(H::align_value)>;
+
+    const ColumnVecType<T>  &vec = get_column<T>(col_name);
+    scv_t                   spectral { num_of_iter, sigma, seed, sfunc };
+
+    // Feed the whole index and the whole column to the visitor
+    //
+    spectral.pre();
+    spectral(indices_.begin(), indices_.end(), vec.begin(), vec.end());
+    spectral.post();
+
+    const auto  &idxs_arr = spectral.get_clusters_idxs();
+    res_t       result;
+
+    // Build one DataFrame per cluster; idxs_arr[i] is presumably the list of
+    // row positions assigned to cluster i -- TODO confirm against spect_v
+    //
+    for (size_type i = 0; i < K; ++i)
+        result[i] = data_by_sel_common_<Ts ...>(idxs_arr[i], indices_.size());
+
+    return (result);
+}
+
+// ----------------------------------------------------------------------------
+
+// Runs the spectral clustering visitor (spect_v) over the column named
+// col_name and returns an array of K PtrViews, one built from each of the
+// K index sets reported by the visitor. This is the non-const overload.
+//
+// col_name:
+//   Name of the column of type T that is handed to the visitor
+// sigma, seed, sfunc, num_of_iter:
+//   Passed on to the spect_v constructor
+//
+// NOTE(review): sfunc is an rvalue reference but is handed to the visitor as
+// an lvalue, so a visitor that takes it by value copies it instead of moving
+// it -- confirm whether std::move(sfunc) was intended.
+//
+template<typename I, typename H>
+template<std::size_t K, arithmetic T, typename ... Ts>
+std::array<typename DataFrame<I, H>::PtrView, K>
+DataFrame<I, H>::
+get_view_by_spectral(const char *col_name,
+                     double sigma,
+                     seed_t seed,
+                     std::function<double(const T &x, const T &y,
+                                          double sigma)>  &&sfunc,
+                     size_type num_of_iter)  {
+
+    using df_t = typename DataFrame<I, H>::PtrView;
+    using res_t = std::array<df_t, K>;
+    using scv_t = spect_v<K, T, I, std::size_t(H::align_value)>;
+
+    const ColumnVecType<T>  &vec = get_column<T>(col_name);
+    scv_t                   spectral { num_of_iter, sigma, seed, sfunc };
+
+    // Feed the whole index and the whole column to the visitor
+    //
+    spectral.pre();
+    spectral(indices_.begin(), indices_.end(), vec.begin(), vec.end());
+    spectral.post();
+
+    const auto  &idxs_arr = spectral.get_clusters_idxs();
+    res_t       result;
+
+    // Build one view per cluster; idxs_arr[i] is presumably the list of
+    // row positions assigned to cluster i -- TODO confirm against spect_v
+    //
+    for (size_type i = 0; i < K; ++i)
+        result[i] = view_by_sel_common_<Ts ...>(idxs_arr[i], indices_.size());
+
+    return (result);
+}
+
+// ----------------------------------------------------------------------------
+
+// Runs the spectral clustering visitor (spect_v) over the column named
+// col_name and returns an array of K ConstPtrViews, one built from each of
+// the K index sets reported by the visitor. This is the const overload.
+//
+// col_name:
+//   Name of the column of type T that is handed to the visitor
+// sigma, seed, sfunc, num_of_iter:
+//   Passed on to the spect_v constructor
+//
+// NOTE(review): sfunc is an rvalue reference but is handed to the visitor as
+// an lvalue, so a visitor that takes it by value copies it instead of moving
+// it -- confirm whether std::move(sfunc) was intended.
+//
+template<typename I, typename H>
+template<std::size_t K, arithmetic T, typename ... Ts>
+std::array<typename DataFrame<I, H>::ConstPtrView, K>
+DataFrame<I, H>::
+get_view_by_spectral(const char *col_name,
+                     double sigma,
+                     seed_t seed,
+                     std::function<double(const T &x, const T &y,
+                                          double sigma)>  &&sfunc,
+                     size_type num_of_iter) const  {
+
+    using df_t = typename DataFrame<I, H>::ConstPtrView;
+    using res_t = std::array<df_t, K>;
+    using scv_t = spect_v<K, T, I, std::size_t(H::align_value)>;
+
+    const ColumnVecType<T>  &vec = get_column<T>(col_name);
+    scv_t                   spectral { num_of_iter, sigma, seed, sfunc };
+
+    // Feed the whole index and the whole column to the visitor
+    //
+    spectral.pre();
+    spectral(indices_.begin(), indices_.end(), vec.begin(), vec.end());
+    spectral.post();
+
+    const auto  &idxs_arr = spectral.get_clusters_idxs();
+    res_t       result;
+
+    // Build one view per cluster; idxs_arr[i] is presumably the list of
+    // row positions assigned to cluster i -- TODO confirm against spect_v
+    //
+    for (size_type i = 0; i < K; ++i)
+        result[i] = view_by_sel_common_<Ts ...>(idxs_arr[i], indices_.size());
+
+    return (result);
+}
+
+// ----------------------------------------------------------------------------
+
 template<typename I, typename H>
 template<arithmetic T, typename ... Ts>
 std::vector<DataFrame<I, HeteroVector<std::size_t(H::align_value)>>>

diff --git a/include/DataFrame/Utils/Threads/ThreadPool.tcc b/include/DataFrame/Utils/Threads/ThreadPool.tcc
@@ -201,7 +201,8 @@ ThreadPool::parallel_loop(I begin, I end, F &&routine, As && ... args)  {
                     ((i - block_size) <= 0) ? 0 : i - block_size
                 };
 
-                if (((end + i) - (end + block_end + 1)) < (block_size - 1))
+                if (size_type((end + i) - (end + block_end + 1)) <
+                        (block_size - 1))
                     block_end = -1;
                 ret.emplace_back(dispatch(false,
                                           std::forward<F>(routine),