Skip to content

Commit

Permalink
Implemented get_[data|view]_by_spectral() + fixed a bug in SpectralCl…
Browse files Browse the repository at this point in the history
…usteringVisitor
  • Loading branch information
hosseinmoein committed Jan 6, 2025
1 parent 0d937aa commit 76e93d1
Show file tree
Hide file tree
Showing 8 changed files with 533 additions and 63 deletions.
4 changes: 4 additions & 0 deletions docs/HTML/DataFrame.html
Original file line number Diff line number Diff line change
Expand Up @@ -429,6 +429,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
<td title="Gets data or view by selection"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/get_data_by_sel.html">get_data_by_sel( 5 )<BR>get_view_by_sel( 5 )</a></td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Gets data or views by applying spectral clustering algorithm"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/get_data_by_spectral.html">get_data_by_spectral()<BR>get_view_by_spectral()</a></td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Gets data or view by standard deviation"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/get_data_by_stdev.html">get_data_by_stdev()<BR>get_view_by_stdev()</a></td>
</tr>
Expand Down
49 changes: 25 additions & 24 deletions docs/HTML/SpectralClusteringVisitor.html

Large diffs are not rendered by default.

218 changes: 218 additions & 0 deletions docs/HTML/get_data_by_spectral.html

Large diffs are not rendered by default.

75 changes: 75 additions & 0 deletions include/DataFrame/DataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -3331,6 +3331,81 @@ class DataFrame : public ThreadGranularity {
size_type num_of_iter = 1000,
seed_t seed = seed_t(-1)) const;

// This uses the spectral clustering algorithm to divide the named column
// into K clusters. It returns an array of K DataFrame's, each containing one
// of the clusters of data based on the named column.
// Self is unchanged.
//
// NOTE: Type T must support arithmetic operations
//
// K:
// Number of clusters for k-means clustering algorithm. It is also the size
// of the returned array.
// T:
// Type of the named column
// Ts:
// List all the types of all data columns. A type should be specified in
// the list only once.
// col_name:
// Name of the given column
// sigma:
// Value that is handed to sfunc as its third argument. In the default
// sfunc it is the width of a Gaussian kernel:
// exp(-(x - y)^2 / (2 * sigma^2))
// seed:
// Seed for random number generator to initialize k-means clustering
// algorithm. Default is a random number for each call.
// sfunc:
// A function that takes two data points of the named column plus sigma
// and returns their similarity as a double. It is used to calculate the
// similarity matrix between data points in the named column.
// num_of_iter:
// Maximum number of iterations for k-means clustering algorithm before
// converging
//
template<std::size_t K, arithmetic T, typename ... Ts>
[[nodiscard]]
std::array<DataFrame<I, HeteroVector<std::size_t(H::align_value)>>, K>
get_data_by_spectral(const char *col_name,
double sigma,
seed_t seed = seed_t(-1),
std::function<double(const T &x, const T &y,
double sigma)> &&sfunc =
[](const T &x, const T &y,
double sigma) -> double {
return (std::exp(-((x - y) * (x - y)) /
(2 * sigma * sigma)));
},
size_type num_of_iter = 1000) const;

// Same as get_data_by_spectral() above, but it returns an array of K
// PtrView's instead of K DataFrame's. This is the non-const overload.
// All template and function parameters are the same as in
// get_data_by_spectral().
//
template<std::size_t K, arithmetic T, typename ... Ts>
[[nodiscard]]
std::array<PtrView, K>
get_view_by_spectral(const char *col_name,
double sigma,
seed_t seed = seed_t(-1),
std::function<double(const T &x, const T &y,
double sigma)> &&sfunc =
[](const T &x, const T &y,
double sigma) -> double {
return (std::exp(-((x - y) * (x - y)) /
(2 * sigma * sigma)));
},
size_type num_of_iter = 1000);

// Same as the non-const get_view_by_spectral() above, but this is the const
// overload and it returns an array of K ConstPtrView's.
// All template and function parameters are the same as in
// get_data_by_spectral().
//
template<std::size_t K, arithmetic T, typename ... Ts>
[[nodiscard]]
std::array<ConstPtrView, K>
get_view_by_spectral(const char *col_name,
double sigma,
seed_t seed = seed_t(-1),
std::function<double(const T &x, const T &y,
double sigma)> &&sfunc =
[](const T &x, const T &y,
double sigma) -> double {
return (std::exp(-((x - y) * (x - y)) /
(2 * sigma * sigma)));
},
size_type num_of_iter = 1000) const;

// This uses Affinity Propagation algorithm to divide the named column
// into clusters. It returns an array of DataFrame's each containing one
// of the clusters of data based on the named column. Unlike K-Means
Expand Down
26 changes: 14 additions & 12 deletions include/DataFrame/DataFrameMLVisitors.h
Original file line number Diff line number Diff line change
Expand Up @@ -2799,7 +2799,6 @@ struct SpectralClusteringVisitor {
do_kmeans_(const mat_t &eigenvecs) {

const long rows = eigenvecs.rows(); // Samples
const long cols = eigenvecs.cols(); // dimensions
vec_t<long> cluster_idxs (rows, -1L);
constexpr long k = long(K);

Expand All @@ -2810,11 +2809,14 @@ struct SpectralClusteringVisitor {

// Copy the top k rows of eigen vector.
//
mat_t means { k, cols };
mat_t means { k, k };

for (long r = 0; r < k; ++r)
for (long c = 0; c < cols; ++c)
means(r, c) = eigenvecs(r, c);
for (long r = 0; r < k; ++r) {
const auto rr = rd_gen(gen);

for (long c = 0; c < k; ++c)
means(r, c) = eigenvecs(rr, c);
}

for (size_type iter = 0; iter < iter_num_; ++iter) {
// Assign cluster_idxs based on closest means
Expand All @@ -2824,7 +2826,7 @@ struct SpectralClusteringVisitor {

for (long rr = 0; rr < k; ++rr) {
const double distance =
distance_func_(eigenvecs, means, r, rr, cols);
distance_func_(eigenvecs, means, r, rr, k);

if (distance < best_distance) {
best_distance = distance;
Expand All @@ -2835,24 +2837,24 @@ struct SpectralClusteringVisitor {

// Update means
//
mat_t new_means { k, cols };
mat_t new_means { k, k };
vec_t<long> counts (k, 0L);

for (long r = 0; r < rows; ++r) {
for (long c = 0; c < cols; ++c)
for (long c = 0; c < k; ++c)
new_means(cluster_idxs[r], c) += eigenvecs(r, c);
counts[cluster_idxs[r]]++;
}

for (int r = 0; r < k; ++r) {
if (counts[r] > 0) {
for (long c = 0; c < cols; ++c)
for (long c = 0; c < k; ++c)
new_means(r, c) /= T(counts[r]);
}
else { // Reinitialize centroid if no points assigned
const auto rr = rd_gen(gen);

for (long c = 0; c < cols; ++c)
for (long c = 0; c < k; ++c)
new_means(r, c) = eigenvecs(rr, c);
}
}
Expand Down Expand Up @@ -2920,13 +2922,13 @@ struct SpectralClusteringVisitor {
SpectralClusteringVisitor(
size_type num_of_iter,
double sigma,
seed_t seed = seed_t(-1),
similarity_func sf =
[](const value_type &x,
const value_type &y,
double sigma) -> double {
return (std::exp(-((x - y) * (x - y)) / (2 * sigma * sigma)));
},
seed_t seed = seed_t(-1))
})
: iter_num_(num_of_iter),
seed_(seed),
sigma_(sigma),
Expand Down
99 changes: 99 additions & 0 deletions include/DataFrame/Internals/DataFrame_slice.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -2712,6 +2712,105 @@ get_view_by_kmeans(const char *col_name,

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<DataFrame<I, HeteroVector<std::size_t(H::align_value)>>, K>
DataFrame<I, H>::
get_data_by_spectral(const char *col_name,
                     double sigma,
                     seed_t seed,
                     std::function<double(const T &x, const T &y,
                                          double sigma)> &&sfunc,
                     size_type num_of_iter) const {

    using cluster_df_t =
        DataFrame<I, HeteroVector<std::size_t(H::align_value)>>;
    using visitor_t = spect_v<K, T, I, std::size_t(H::align_value)>;

    // Look up the named column first, then set up the spectral clustering
    // visitor with the caller's parameters.
    //
    const ColumnVecType<T>  &column = get_column<T>(col_name);
    visitor_t               clusterer { num_of_iter, sigma, seed, sfunc };

    // Standard visitor protocol: pre(), one pass over index + column, post()
    //
    clusterer.pre();
    clusterer(indices_.begin(), indices_.end(), column.begin(), column.end());
    clusterer.post();

    // One entry per cluster; each entry is fed to the common selection
    // routine to materialize a DataFrame for that cluster.
    //
    const auto                  &cluster_rows = clusterer.get_clusters_idxs();
    std::array<cluster_df_t, K> clusters_out;

    for (size_type cluster = 0; cluster != K; ++cluster)
        clusters_out[cluster] =
            data_by_sel_common_<Ts ...>(cluster_rows[cluster],
                                        indices_.size());

    return (clusters_out);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<typename DataFrame<I, H>::PtrView, K>
DataFrame<I, H>::
get_view_by_spectral(const char *col_name,
                     double sigma,
                     seed_t seed,
                     std::function<double(const T &x, const T &y,
                                          double sigma)> &&sfunc,
                     size_type num_of_iter) {

    using view_t = typename DataFrame<I, H>::PtrView;
    using visitor_t = spect_v<K, T, I, std::size_t(H::align_value)>;

    // Look up the named column first, then set up the spectral clustering
    // visitor with the caller's parameters.
    //
    const ColumnVecType<T>  &column = get_column<T>(col_name);
    visitor_t               clusterer { num_of_iter, sigma, seed, sfunc };

    // Standard visitor protocol: pre(), one pass over index + column, post()
    //
    clusterer.pre();
    clusterer(indices_.begin(), indices_.end(), column.begin(), column.end());
    clusterer.post();

    // One entry per cluster; each entry is fed to the common selection
    // routine to build a (non-const) view for that cluster.
    //
    const auto              &cluster_rows = clusterer.get_clusters_idxs();
    std::array<view_t, K>   views_out;

    for (size_type cluster = 0; cluster != K; ++cluster)
        views_out[cluster] =
            view_by_sel_common_<Ts ...>(cluster_rows[cluster],
                                        indices_.size());

    return (views_out);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<std::size_t K, arithmetic T, typename ... Ts>
std::array<typename DataFrame<I, H>::ConstPtrView, K>
DataFrame<I, H>::
get_view_by_spectral(const char *col_name,
                     double sigma,
                     seed_t seed,
                     std::function<double(const T &x, const T &y,
                                          double sigma)> &&sfunc,
                     size_type num_of_iter) const {

    using const_view_t = typename DataFrame<I, H>::ConstPtrView;
    using visitor_t = spect_v<K, T, I, std::size_t(H::align_value)>;

    // Look up the named column first, then set up the spectral clustering
    // visitor with the caller's parameters.
    //
    const ColumnVecType<T>  &column = get_column<T>(col_name);
    visitor_t               clusterer { num_of_iter, sigma, seed, sfunc };

    // Standard visitor protocol: pre(), one pass over index + column, post()
    //
    clusterer.pre();
    clusterer(indices_.begin(), indices_.end(), column.begin(), column.end());
    clusterer.post();

    // One entry per cluster; each entry is fed to the common selection
    // routine to build a const view for that cluster.
    //
    const auto                  &cluster_rows = clusterer.get_clusters_idxs();
    std::array<const_view_t, K> views_out;

    for (size_type cluster = 0; cluster != K; ++cluster)
        views_out[cluster] =
            view_by_sel_common_<Ts ...>(cluster_rows[cluster],
                                        indices_.size());

    return (views_out);
}

// ----------------------------------------------------------------------------

template<typename I, typename H>
template<arithmetic T, typename ... Ts>
std::vector<DataFrame<I, HeteroVector<std::size_t(H::align_value)>>>
Expand Down
3 changes: 2 additions & 1 deletion include/DataFrame/Utils/Threads/ThreadPool.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,8 @@ ThreadPool::parallel_loop(I begin, I end, F &&routine, As && ... args) {
((i - block_size) <= 0) ? 0 : i - block_size
};

if (((end + i) - (end + block_end + 1)) < (block_size - 1))
if (size_type((end + i) - (end + block_end + 1)) <
(block_size - 1))
block_end = -1;
ret.emplace_back(dispatch(false,
std::forward<F>(routine),
Expand Down
Loading

0 comments on commit 76e93d1

Please sign in to comment.