Skip to content

Commit

Permalink
Implemented canon_corr()
Browse files Browse the repository at this point in the history
  • Loading branch information
hosseinmoein committed Jan 11, 2025
1 parent 6705f01 commit 4de1767
Show file tree
Hide file tree
Showing 10 changed files with 366 additions and 8 deletions.
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -64,7 +64,7 @@ Each program has three identical parts. First it generates and populates 3 colum
The maximum dataset I could load into Polars was 300m rows per column. Any bigger dataset blew up the memory and caused OS to kill it. I ran C++ DataFrame with 10b rows per column and I am sure it would have run with bigger datasets too. So, I was forced to run both with 300m rows to compare.
I ran each test 4 times and took the best time. Polars numbers varied a lot from one run to another, especially calculation and selection times. C++ DataFrame numbers were significantly more consistent.

| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [&nbsp;&nbsp;&nbsp;<B>Polars</B>&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [&nbsp;&nbsp;&nbsp;<B>Pandas</B>&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
| :-- | ---: | ---: | ---: |
| Data generation/load time | 26.9459 secs | 28.4686 secs | 36.6799 secs |
| Calculation time | 1.2602 secs | 4.8766 secs | 40.3264 secs |
Expand Down
4 changes: 4 additions & 0 deletions docs/HTML/DataFrame.html
Original file line number Diff line number Diff line change
Expand Up @@ -253,6 +253,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
<td title="These are used to get information from data" style="text-align:center;background-color:LightGrey;color:DarkBlue">Getting Information &nbsp;&nbsp; <font size="+3">&#128129;</font></td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Performs canonical correlation analysis between two sets of columns"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/canon_corr.html">canon_corr</a>()</td>
</tr>

<tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
<td title="Get column index for the given column name"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/col_name_to_idx.html">col_name_to_idx</a>()</td>
</tr>
Expand Down
142 changes: 142 additions & 0 deletions docs/HTML/canon_corr.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
<!--
Copyright (c) 2019-2026, Hossein Moein
All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above copyright
notice, this list of conditions and the following disclaimer in the
documentation and/or other materials provided with the distribution.
* Neither the name of Hossein Moein and/or the DataFrame nor the
names of its contributors may be used to endorse or promote products
derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL Hossein Moein BE LIABLE FOR ANY
DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
-->
<!DOCTYPE html>
<html>

<head>
<style>
body {
background-image: linear-gradient(Azure, AliceBlue, GhostWhite, WhiteSmoke);
}

a { color: #FF5555; }
</style>
</head>

<body style="font-family: Georgia, serif">
<font size="+3">&#8592;</font> <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/DataFrame.html">Back to Documentations</a><BR><BR>

<table border="1">

<tr bgcolor="lightblue">
<th>Signature</th> <th>Description</th>
</tr>

<tr bgcolor="Azure">
<td bgcolor="blue"> <font color="white">
<PRE><B>
template&lt;typename T&gt;
struct CanonCorrResult {

// These values represent the strength of the linear relationship between
// each pair of canonical variates, ranging from -1 to 1, with higher
// absolute values signifying a stronger association.
//
std::vector&lt;T&gt; coeffs { }; // Canonical correlation coefficients

// The Redundancy Index is a measure that indicates how much variance in
// one set of variables is explained by the linear combination of the other
// set of variables. This was proposed by Stewart and Love (1968).
//
T x_red_idx { }; // Redundancy index for X
T y_red_idx { }; // Redundancy index for Y
};
</B></PRE>
</font>
</td>
<td>
Result of Canonical Correlation Analysis as returned by canon_corr() interface<BR>
</td>
</tr>

</table>

<BR>
<table border="1">

<tr bgcolor="lightblue">
<th>Signature</th> <th>Description</th> <th>Parameters</th>
</tr>

<tr bgcolor="Azure">
<td bgcolor="blue"> <font color="white">
<PRE><B>
template&lt;typename T&gt;
CanonCorrResult&lt;T&gt;
canon_corr(std::vector&lt;const char *&gt; &amp;&amp;X_col_names,
std::vector&lt;const char *&gt; &amp;&amp;Y_col_names) const;
</B></PRE></font>
</td>
<td>
This performs Canonical Correlation Analysis (CCA) between two sets of columns <I>X</I> and <I>Y</I>. It returns the result in a struct defined above.<BR>
CCA is a statistical method for examining and measuring correlations between two sets of variables. Fundamentally, CCA looks for linear combinations of variables, also referred to as canonical variables, within each set so that the correlation between them is maximized. Finding relationships and patterns of linkage between the two groups is the main objective.<BR><BR>

<B>NOTE</B>: Number of columns in each set must be the same<BR>
</td>
<td width="28%">
<B>T</B>: Type of the named columns<BR>
<B>X_col_names</B>: Names of the first set of columns<BR>
<B>Y_col_names</B>: Names of the second set of columns<BR>
</td>
</tr>

</table>

<pre class="code_syntax" style="color:#000000;background:#ffffff00;"><span class="line_wrapper"><span style="color:#800000; font-weight:bold; ">static</span> <span style="color:#800000; font-weight:bold; ">void</span> test_canon_corr<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span> <span style="color:#800080; ">{</span></span>
<span class="line_wrapper"></span>
<span class="line_wrapper"> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">cout</span> <span style="color:#808030; ">&lt;</span><span style="color:#808030; ">&lt;</span> <span style="color:#800000; ">"</span><span style="color:#0f69ff; ">\n</span><span style="color:#0000e6; ">Testing canon_corr( ) ...</span><span style="color:#800000; ">"</span> <span style="color:#808030; ">&lt;</span><span style="color:#808030; ">&lt;</span> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">endl</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"></span>
<span class="line_wrapper"> StrDataFrame df<span style="color:#800080; ">;</span></span>
<span class="line_wrapper"></span>
<span class="line_wrapper"> <span style="color:#800000; font-weight:bold; ">try</span> <span style="color:#800080; ">{</span></span>
<span class="line_wrapper"> df<span style="color:#808030; ">.</span><span style="color:#603000; ">read</span><span style="color:#808030; ">(</span><span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM.csv</span><span style="color:#800000; ">"</span><span style="color:#808030; ">,</span> io_format<span style="color:#800080; ">::</span>csv2<span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"> <span style="color:#800080; ">}</span></span>
<span class="line_wrapper"> <span style="color:#800000; font-weight:bold; ">catch</span> <span style="color:#808030; ">(</span><span style="color:#800000; font-weight:bold; ">const</span> DataFrameError <span style="color:#808030; ">&amp;</span>ex<span style="color:#808030; ">)</span> <span style="color:#800080; ">{</span></span>
<span class="line_wrapper"> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">cout</span> <span style="color:#808030; ">&lt;</span><span style="color:#808030; ">&lt;</span> ex<span style="color:#808030; ">.</span>what<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span><span style="color:#808030; ">&lt;</span> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">endl</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"> <span style="color:#800080; ">}</span></span>
<span class="line_wrapper"></span>
<span class="line_wrapper"> <span style="color:#800000; font-weight:bold; ">const</span> <span style="color:#800000; font-weight:bold; ">auto</span> result <span style="color:#808030; ">=</span> df<span style="color:#808030; ">.</span>canon_corr<span style="color:#800080; ">&lt;</span><span style="color:#800000; font-weight:bold; ">double</span><span style="color:#800080; ">&gt;</span><span style="color:#808030; ">(</span><span style="color:#800080; ">{</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Close</span><span style="color:#800000; ">"</span><span style="color:#808030; ">,</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Open</span><span style="color:#800000; ">"</span> <span style="color:#800080; ">}</span><span style="color:#808030; ">,</span> <span style="color:#800080; ">{</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_High</span><span style="color:#800000; ">"</span><span style="color:#808030; ">,</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Low</span><span style="color:#800000; ">"</span> <span style="color:#800080; ">}</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"></span>
<span class="line_wrapper"> assert<span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>coeffs<span style="color:#808030; ">.</span>size<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">=</span><span style="color:#808030; ">=</span> <span style="color:#008c00; ">2</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"> assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>coeffs<span style="color:#808030; ">[</span><span style="color:#008c00; ">0</span><span style="color:#808030; ">]</span> <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.999944</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"> assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>coeffs<span style="color:#808030; ">[</span><span style="color:#008c00; ">1</span><span style="color:#808030; ">]</span> <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.262927</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"> assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>x_red_idx <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.534073</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"> assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>y_red_idx <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.535897</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
<span class="line_wrapper"><span style="color:#800080; ">}</span></span>
<span class="line_wrapper"></span></pre>

<BR><img src="https://github.com/hosseinmoein/DataFrame/blob/master/docs/LionLookingUp.jpg?raw=true" alt="C++ DataFrame"
width="200" height="200" style="float:right"/>

</body>
</html>

<!--
Local Variables:
mode:HTML
tab-width:4
c-basic-offset:4
End:
-->
24 changes: 24 additions & 0 deletions include/DataFrame/DataFrame.h
Original file line number Diff line number Diff line change
Expand Up @@ -3891,6 +3891,30 @@ class DataFrame : public ThreadGranularity {
normalization_type norm_type =
normalization_type::z_score) const;

// This performs Canonical Correlation Analysis (CCA) between two sets of
// columns, X and Y. It returns the result in a CanonCorrResult<T> struct.
//
// CCA is a statistical method for examining and measuring correlations
// between two sets of variables. Fundamentally, CCA looks for linear
// combinations of variables, also referred to as canonical variables,
// within each set so that the correlation between them is maximized.
// Finding relationships and patterns of linkage between the two groups
// is the main objective.
//
// NOTE: Number of columns in each set must be the same
//
// T:
// Type of the named columns
// X_col_names:
// Names of the first set of columns
// Y_col_names:
// Names of the second set of columns
//
template<typename T>
[[nodiscard]] CanonCorrResult<T>
canon_corr(std::vector<const char *> &&X_col_names,
std::vector<const char *> &&Y_col_names) const;

// This function returns a DataFrame indexed by std::string that provides
// a few statistics about the columns of the calling DataFrame.
// The statistics are:
Expand Down
21 changes: 21 additions & 0 deletions include/DataFrame/DataFrameTypes.h
Original file line number Diff line number Diff line change
Expand Up @@ -733,6 +733,27 @@ struct PCAParams {

// ----------------------------------------------------------------------------

// Holds the output of a Canonical Correlation Analysis (CCA) run
//
template<typename T>
struct  CanonCorrResult  {

    // Canonical correlation coefficients, one per pair of canonical
    // variates. Each one measures how strongly that pair is linearly
    // related; values range from -1 to 1 and a larger magnitude means a
    // stronger association.
    //
    std::vector<T>  coeffs;

    // Redundancy indices, as proposed by Stewart and Love (1968): how much
    // of the variance in one set of variables is explained by the linear
    // combination of the other set of variables.
    //
    T               x_red_idx { };  // Index for the X set
    T               y_red_idx { };  // Index for the Y set
};

// ----------------------------------------------------------------------------

template<typename T>
struct RandGenParams {

Expand Down
90 changes: 90 additions & 0 deletions include/DataFrame/Internals/DataFrame_get.tcc
Original file line number Diff line number Diff line change
Expand Up @@ -1052,6 +1052,96 @@ compact_svd(std::vector<const char *> &&col_names,
return (std::make_tuple(U, S, V));
}

// ----------------------------------------------------------------------------

// Canonical Correlation Analysis between the column sets X and Y.
//
// Outline:
//   1. Look up every named column while holding the DataFrame lock and
//      record the shortest length seen (capped by the index length).
//   2. Copy the columns into two column-major matrices, X and Y.
//   3. Build the centered covariance matrices and form
//      inv(X_cov) * XY_cov * inv(Y_cov) * XY_cov'.
//   4. Run an SVD on that product; column 0 of S supplies the reported
//      coefficients.
//   5. Derive the redundancy index of each set from the squared
//      coefficients, weighted by the diagonal of that set's covariance.
//
// T:
//   Type of the named columns
// X_col_names:
//   Names of the first set of columns
// Y_col_names:
//   Names of the second set of columns
//
// Throws NotFeasible if the two sets differ in size, but only when
// HMDF_SANITY_EXCEPTIONS is defined.
//
template<typename I, typename H>
template<typename T>
CanonCorrResult<T> DataFrame<I, H>::
canon_corr(std::vector<const char *> &&X_col_names,
           std::vector<const char *> &&Y_col_names) const  {

    using col_mat_t = Matrix<T, matrix_orient::column_major>;

#ifdef HMDF_SANITY_EXCEPTIONS
    if (X_col_names.size() != Y_col_names.size())
        throw NotFeasible("canon_corr(): "
                          "Two sets must have same number of variables");
#endif // HMDF_SANITY_EXCEPTIONS

    const size_type x_count = X_col_names.size();
    const size_type y_count = Y_col_names.size();
    size_type       min_col_s { indices_.size() };

    // X columns occupy slots [0, x_count); Y columns follow them
    //
    std::vector<const ColumnVecType<T> *>   columns
        (x_count + y_count, nullptr);

    // The lock is held only for the column lookups below
    //
    SpinGuard   guard { lock_ };

    for (size_type i { 0 }; i < x_count; ++i)  {
        columns[i] = &get_column<T>(X_col_names[i], false);
        if (columns[i]->size() < min_col_s)
            min_col_s = columns[i]->size();
    }
    for (size_type i { 0 }; i < y_count; ++i)  {
        const size_type idx = i + x_count;

        columns[idx] = &get_column<T>(Y_col_names[i], false);
        if (columns[idx]->size() < min_col_s)
            min_col_s = columns[idx]->size();
    }
    guard.release();

    // Every column is truncated to the shortest common length
    //
    col_mat_t   X { static_cast<long>(min_col_s), static_cast<long>(x_count) };

    for (size_type i { 0 }; i < x_count; ++i)
        X.set_column(columns[i]->begin(), i);

    col_mat_t   Y { static_cast<long>(min_col_s), static_cast<long>(y_count) };

    for (size_type i { 0 }; i < y_count; ++i)
        Y.set_column(columns[i + x_count]->begin(), i);

    const auto  XY_cov = _calc_centered_cov_(X, Y);
    const auto  X_cov = _calc_centered_cov_(X, X);
    const auto  Y_cov = _calc_centered_cov_(Y, Y);
    const auto  sq_root_mat =
        X_cov.inverse() * XY_cov * Y_cov.inverse() * XY_cov.transpose();
    col_mat_t   U;
    col_mat_t   S;
    col_mat_t   V;

    sq_root_mat.svd(U, S, V, false);

    CanonCorrResult<T>  result;

    // NOTE(review): the eigenvalues of the product above are the squared
    // canonical correlations. The values in S are reported as-is here and
    // squared in the redundancy computation below -- confirm this matches
    // what svd() returns for this matrix.
    //
    result.coeffs.reserve(S.rows());
    for (long i { 0 }; i < S.rows(); ++i)
        result.coeffs.push_back(S(i, 0));

    // Redundancy index of one set: the squared coefficients weighted by the
    // diagonal of that set's covariance, divided by the sum of the diagonal.
    // Each loop is bounded by the dimensions of the matrices it actually
    // reads, so nothing is indexed out of range even when the two sets have
    // different sizes and the sanity check above is compiled out.
    //
    const auto  redundancy =
        [&S](const auto &cov) -> T  {
            T   diag_sum { 0 };

            for (long i { 0 }; i < cov.rows(); ++i)
                diag_sum += cov(i, i);

            T   redun { 0 };

            for (long i { 0 }; i < cov.rows() && i < S.rows(); ++i)  {
                const T S_val = S(i, 0);

                redun += S_val * S_val * cov(i, i);
            }
            return (redun / diag_sum);
        };

    result.x_red_idx = redundancy(X_cov);
    result.y_red_idx = redundancy(Y_cov);

    return (result);
}

} // namespace hmdf

// ----------------------------------------------------------------------------
Expand Down
Loading

0 comments on commit 4de1767

Please sign in to comment.