Implemented canon_corr()

hosseinmoein · Jan 11, 2025 · 4de1767 · 4de1767
1 parent 6705f01
commit 4de1767
Show file tree

Hide file tree

Showing 10 changed files with 366 additions and 8 deletions.
diff --git a/README.md b/README.md
@@ -64,7 +64,7 @@ Each program has three identical parts. First it generates and populates 3 colum
 The maximum dataset I could load into Polars was 300m rows per column. Any bigger dataset blew up the memory and caused OS to kill it. I ran C++ DataFrame with 10b rows per column and I am sure it would have run with bigger datasets too. So, I was forced to run both with 300m rows to compare.
 I ran each test 4 times and took the best time. Polars numbers varied a lot from one run to another, especially calculation and selection times. C++ DataFrame numbers were significantly more consistent.
 
-|                          | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
+|                          | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [&nbsp;&nbsp;&nbsp;<B>Polars</B>&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [&nbsp;&nbsp;&nbsp;<B>Pandas</B>&nbsp;&nbsp;&nbsp;&nbsp;](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) |
 |            :--           |        ---:         |     ---:      |     ---:      |
 | Data generation/load time | 26.9459 secs | 28.4686 secs | 36.6799 secs |
 | Calculation time | 1.2602 secs | 4.8766 secs | 40.3264 secs |

diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html
@@ -253,6 +253,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&#
       <td title="These are used to get information from data" style="text-align:center;background-color:LightGrey;color:DarkBlue">Getting Information &nbsp;&nbsp; <font size="+3">&#128129;</font></td>
     </tr>
 
+    <tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
+      <td title="Performs canonical correlation analysis between two sets of columns"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/canon_corr.html">canon_corr</a>()</td>
+    </tr>
+
     <tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';">
       <td title="Get column index for the given column name"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/col_name_to_idx.html">col_name_to_idx</a>()</td>
     </tr>

diff --git a/docs/HTML/canon_corr.html b/docs/HTML/canon_corr.html
@@ -0,0 +1,142 @@
+<!--
+Copyright (c) 2019-2026, Hossein Moein
+All rights reserved.
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+* Redistributions of source code must retain the above copyright
+  notice, this list of conditions and the following disclaimer.
+* Redistributions in binary form must reproduce the above copyright
+  notice, this list of conditions and the following disclaimer in the
+  documentation and/or other materials provided with the distribution.
+* Neither the name of Hossein Moein and/or the DataFrame nor the
+  names of its contributors may be used to endorse or promote products
+  derived from this software without specific prior written permission.
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL Hossein Moein BE LIABLE FOR ANY
+DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+-->
+<!DOCTYPE html>
+<html>
+
+<head>
+<style>
+body {
+  background-image: linear-gradient(Azure, AliceBlue, GhostWhite, WhiteSmoke);
+}
+
+a { color: #FF5555; }
+</style>
+</head>
+
+<body style="font-family: Georgia, serif">
+  <font size="+3">&#8592;</font> <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/DataFrame.html">Back to Documentations</a><BR><BR>
+
+  <table border="1">
+
+    <tr bgcolor="lightblue">
+      <th>Signature</th> <th>Description</th>
+    </tr>
+
+    <tr bgcolor="Azure">
+      <td bgcolor="blue"> <font color="white">
+        <PRE><B>
+template&lt;typename T&gt;
+struct  CanonCorrResult  {
+
+    // These values represent the strength of the linear relationship between
+    // each pair of canonical variates, ranging from -1 to 1, with higher
+    // absolute values signifying a stronger association.
+    //
+    std::vector&lt;T&gt;  coeffs { };     // Canonical correlation coefficients
+
+    // The Redundancy Index is a measure that indicates how much variance in
+    // one set of variables is explained by the linear combination of the other
+    // set of variables. This was proposed by Stewart and Love (1968).
+    //
+    T               x_red_idx { };  // Redundancy index for X
+    T               y_red_idx { };  // Redundancy index for Y
+};
+        </B></PRE>
+        </font>
+      </td>
+      <td>
+        Result of Canonical Correlation Analysis as returned by canon_corr() interface<BR>
+      </td>
+    </tr>
+
+  </table>
+
+  <BR>
+  <table border="1">
+
+    <tr bgcolor="lightblue">
+        <th>Signature</th> <th>Description</th> <th>Parameters</th>
+    </tr>
+
+    <tr bgcolor="Azure">
+      <td bgcolor="blue"> <font color="white">
+<PRE><B>
+template&lt;typename T&gt;
+CanonCorrResult&lt;T&gt;
+canon_corr(std::vector&lt;const char *&gt; &amp;&amp;X_col_names,
+           std::vector&lt;const char *&gt; &amp;&amp;Y_col_names) const;
+</B></PRE></font>
+      </td>
+      <td>
+        This performs Canonical Correlation Analysis (CCA) between two sets of columns <I>X</I> and <I>Y</I>. It returns the result in a struct defined above.<BR>
+        CCA is a statistical method for examining and measuring correlations between two sets of variables. Fundamentally, CCA looks for linear combinations of variables, also referred to as canonical variables, within each set so that the correlation between them is maximized. Finding relationships and patterns of linkage between the two groups is the main objective.<BR><BR>
+
+        <B>NOTE</B>: Number of columns in each set must be the same<BR>
+      </td>
+      <td width="28%">
+       <B>T</B>: Type of the named columns<BR>
+       <B>X_col_names</B>: Names of the first set of columns<BR>
+       <B>Y_col_names</B>: Names of the second set of columns<BR>
+      </td>
+    </tr>
+
+  </table>
+
+<pre class="code_syntax" style="color:#000000;background:#ffffff00;"><span class="line_wrapper"><span style="color:#800000; font-weight:bold; ">static</span> <span style="color:#800000; font-weight:bold; ">void</span> test_canon_corr<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span>  <span style="color:#800080; ">{</span></span>
+<span class="line_wrapper"></span>
+<span class="line_wrapper">    <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">cout</span> <span style="color:#808030; ">&lt;</span><span style="color:#808030; ">&lt;</span> <span style="color:#800000; ">"</span><span style="color:#0f69ff; ">\n</span><span style="color:#0000e6; ">Testing canon_corr( ) ...</span><span style="color:#800000; ">"</span> <span style="color:#808030; ">&lt;</span><span style="color:#808030; ">&lt;</span> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">endl</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper"></span>
+<span class="line_wrapper">    StrDataFrame    df<span style="color:#800080; ">;</span></span>
+<span class="line_wrapper"></span>
+<span class="line_wrapper">    <span style="color:#800000; font-weight:bold; ">try</span>  <span style="color:#800080; ">{</span></span>
+<span class="line_wrapper">        df<span style="color:#808030; ">.</span><span style="color:#603000; ">read</span><span style="color:#808030; ">(</span><span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM.csv</span><span style="color:#800000; ">"</span><span style="color:#808030; ">,</span> io_format<span style="color:#800080; ">::</span>csv2<span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper">    <span style="color:#800080; ">}</span></span>
+<span class="line_wrapper">    <span style="color:#800000; font-weight:bold; ">catch</span> <span style="color:#808030; ">(</span><span style="color:#800000; font-weight:bold; ">const</span> DataFrameError <span style="color:#808030; ">&amp;</span>ex<span style="color:#808030; ">)</span>  <span style="color:#800080; ">{</span></span>
+<span class="line_wrapper">        <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">cout</span> <span style="color:#808030; ">&lt;</span><span style="color:#808030; ">&lt;</span> ex<span style="color:#808030; ">.</span>what<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span><span style="color:#808030; ">&lt;</span> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">endl</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper">    <span style="color:#800080; ">}</span></span>
+<span class="line_wrapper"></span>
+<span class="line_wrapper">    <span style="color:#800000; font-weight:bold; ">const</span> <span style="color:#800000; font-weight:bold; ">auto</span>  result <span style="color:#808030; ">=</span> df<span style="color:#808030; ">.</span>canon_corr<span style="color:#800080; ">&lt;</span><span style="color:#800000; font-weight:bold; ">double</span><span style="color:#800080; ">&gt;</span><span style="color:#808030; ">(</span><span style="color:#800080; ">{</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Close</span><span style="color:#800000; ">"</span><span style="color:#808030; ">,</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Open</span><span style="color:#800000; ">"</span> <span style="color:#800080; ">}</span><span style="color:#808030; ">,</span> <span style="color:#800080; ">{</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_High</span><span style="color:#800000; ">"</span><span style="color:#808030; ">,</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Low</span><span style="color:#800000; ">"</span> <span style="color:#800080; ">}</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper"></span>
+<span class="line_wrapper">    assert<span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>coeffs<span style="color:#808030; ">.</span>size<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">=</span><span style="color:#808030; ">=</span> <span style="color:#008c00; ">2</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper">    assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>coeffs<span style="color:#808030; ">[</span><span style="color:#008c00; ">0</span><span style="color:#808030; ">]</span> <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.999944</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper">    assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>coeffs<span style="color:#808030; ">[</span><span style="color:#008c00; ">1</span><span style="color:#808030; ">]</span> <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.262927</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper">    assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>x_red_idx <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.534073</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper">    assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>y_red_idx <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.535897</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">&lt;</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span>
+<span class="line_wrapper"><span style="color:#800080; ">}</span></span>
+<span class="line_wrapper"></span></pre>
+
+  <BR><img src="https://github.com/hosseinmoein/DataFrame/blob/master/docs/LionLookingUp.jpg?raw=true" alt="C++ DataFrame"
+       width="200" height="200" style="float:right"/>
+
+</body>
+</html>
+
+<!--
+Local Variables:
+mode:HTML
+tab-width:4
+c-basic-offset:4
+End:
+-->
diff --git a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h
@@ -3891,6 +3891,30 @@ class   DataFrame : public ThreadGranularity {
                 normalization_type norm_type =
                     normalization_type::z_score) const;
 
+    // This performs Canonical Correlation Analysis (CCA) between two sets of
+    // columns X and Y. It returns the result in a CanonCorrResult<T> struct.
+    //
+    // CCA is a statistical method for examining and measuring correlations
+    // between two sets of variables. Fundamentally, CCA looks for linear
+    // combinations of variables, also referred to as canonical variables,
+    // within each set so that the correlation between them is maximized.
+    // Finding relationships and patterns of linkage between the two groups
+    // is the main objective.
+    //
+    // NOTE: Number of columns in each set must be the same
+    //
+    // T:
+    //   Type of the named columns
+    // X_col_names:
+    //   Names of the first set of columns
+    // Y_col_names:
+    //   Names of the second set of columns
+    //
+    template<typename T>
+    [[nodiscard]] CanonCorrResult<T>
+    canon_corr(std::vector<const char *> &&X_col_names,
+               std::vector<const char *> &&Y_col_names) const;
+
+
     // This function returns a DataFrame indexed by std::string that provides
     // a few statistics about the columns of the calling DataFrame.
     // The statistics are:

diff --git a/include/DataFrame/DataFrameTypes.h b/include/DataFrame/DataFrameTypes.h
@@ -733,6 +733,27 @@ struct  PCAParams  {
 
 // ----------------------------------------------------------------------------
 
+// Canonical correlation analysis result, as returned by canon_corr()
+//
+template<typename T>
+struct  CanonCorrResult  {
+
+    // These values represent the strength of the linear relationship between
+    // each pair of canonical variates, ranging from -1 to 1, with higher
+    // absolute values signifying a stronger association.
+    // canon_corr() fills this with one value per row of its SVD S matrix.
+    std::vector<T>  coeffs { };     // Canonical correlation coefficients
+
+    // The Redundancy Index is a measure that indicates how much variance in
+    // one set of variables is explained by the linear combination of the other
+    // set of variables. This was proposed by Stewart and Love (1968).
+    // Computed as sum(coeff^2 * cov(i, i)) / sum(cov(i, i)) by canon_corr().
+    T               x_red_idx { };  // Redundancy index for X
+    T               y_red_idx { };  // Redundancy index for Y
+};
+
+// ----------------------------------------------------------------------------
+
 template<typename T>
 struct  RandGenParams  {
 

diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc
@@ -1052,6 +1052,96 @@ compact_svd(std::vector<const char *> &&col_names,
     return (std::make_tuple(U, S, V));
 }
 
+// ----------------------------------------------------------------------------
+
+template<typename I, typename H>
+template<typename T>
+CanonCorrResult<T> DataFrame<I, H>::
+canon_corr(std::vector<const char *> &&X_col_names,
+           std::vector<const char *> &&Y_col_names) const  {
+    // Canonical correlation analysis of column set X against column set Y.
+    using col_mat_t = Matrix<T, matrix_orient::column_major>;
+    // The size check below is compiled only with HMDF_SANITY_EXCEPTIONS.
+#ifdef HMDF_SANITY_EXCEPTIONS
+    if (X_col_names.size() != Y_col_names.size())
+        throw NotFeasible("canon_corr(): "
+                          "Two sets must have same number of variables");
+#endif // HMDF_SANITY_EXCEPTIONS
+    // columns holds the X column pointers first, followed by the Y ones.
+    size_type                               min_col_s { indices_.size() };
+    std::vector<const ColumnVecType<T> *>   columns
+        (X_col_names.size() + Y_col_names.size(), nullptr);
+    SpinGuard                               guard { lock_ };
+    // Fetch the columns; min_col_s = min(index size, all column sizes).
+    for (size_type i { 0 }; i < X_col_names.size(); ++i)  {
+        columns[i] = &get_column<T>(X_col_names[i], false);
+        if (columns[i]->size() < min_col_s)
+            min_col_s = columns[i]->size();
+    }
+    for (size_type i { 0 }; i < Y_col_names.size(); ++i)  {
+        const size_type idx = i + X_col_names.size();
+
+        columns[idx] = &get_column<T>(Y_col_names[i], false);
+        if (columns[idx]->size() < min_col_s)
+            min_col_s = columns[idx]->size();
+    }
+    guard.release();
+    // Both matrices are sized with min_col_s and their set's name count.
+    col_mat_t   X { long(min_col_s), long(X_col_names.size()) };
+
+    for (size_type i { 0 }; i < X_col_names.size(); ++i)
+        X.set_column(columns[i]->begin(), i);
+
+    col_mat_t   Y { long(min_col_s), long(Y_col_names.size()) };
+
+    for (size_type i { 0 }; i < Y_col_names.size(); ++i)
+        Y.set_column(columns[i + X_col_names.size()]->begin(), i);
+    // Covariance helper applied to the pairs (X, Y), (X, X) and (Y, Y).
+    const auto  XY_cov = _calc_centered_cov_(X, Y);
+    const auto  X_cov = _calc_centered_cov_(X, X);
+    const auto  Y_cov = _calc_centered_cov_(Y, Y);
+    const auto  sq_root_mat =
+        X_cov.inverse() * XY_cov * Y_cov.inverse() * XY_cov.transpose();
+    col_mat_t   U;
+    col_mat_t   S;
+    col_mat_t   V;
+    // NOTE(review): the meaning of svd()'s last argument is not shown here.
+    sq_root_mat.svd(U, S, V, false);
+
+    CanonCorrResult<T>  result;
+    // coeffs takes column 0 of S, one entry per row of S.
+    result.coeffs.reserve(S.rows());
+    for (long i { 0 }; i < S.rows(); ++i)
+        result.coeffs.push_back(S(i, 0));
+    // Diagonal sums below are the denominators of the redundancy indices.
+    T   X_cov_diag_sum { 0 };
+    T   Y_cov_diag_sum { 0 };
+    // NOTE(review): Y_cov is indexed with X_cov.rows(); assumes equal sizes.
+    for (long i { 0 }; i < X_cov.rows(); ++i)  {
+        X_cov_diag_sum += X_cov(i, i);
+        Y_cov_diag_sum += Y_cov(i, i);
+    }
+    // x_red_idx = sum(S(i, 0)^2 * X_cov(i, i)) / X_cov_diag_sum
+    T   redun { 0 };
+
+    for (long i { 0 }; i < X_cov.rows(); ++i)  {
+        const T S_val = S(i, 0);
+
+        redun += S_val * S_val * X_cov(i, i);
+    }
+    result.x_red_idx = redun / X_cov_diag_sum;
+    // y_red_idx: same computation with Y_cov and Y_cov_diag_sum.
+    redun = 0;
+    for (long i { 0 }; i < Y_cov.rows(); ++i)  {
+        const T S_val = S(i, 0);
+
+        redun += S_val * S_val * Y_cov(i, i);
+    }
+    result.y_red_idx = redun / Y_cov_diag_sum;
+
+    return (result);
+}
+
 } // namespace hmdf
 
 // ----------------------------------------------------------------------------