diff --git a/README.md b/README.md index 3ac9539a..70171c97 100644 --- a/README.md +++ b/README.md @@ -64,7 +64,7 @@ Each program has three identical parts. First it generates and populates 3 colum The maximum dataset I could load into Polars was 300m rows per column. Any bigger dataset blew up the memory and caused OS to kill it. I ran C++ DataFrame with 10b rows per column and I am sure it would have run with bigger datasets too. So, I was forced to run both with 300m rows to compare. I ran each test 4 times and took the best time. Polars numbers varied a lot from one run to another, especially calculation and selection times. C++ DataFrame numbers were significantly more consistent. -| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [<B>Polars</B> ](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [<B>Pandas</B> ](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) | +| | [<B>C++ DataFrame</B>](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/dataframe_performance.cc) | [  <B>Polars</B> ](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/polars_performance.py) | [  <B>Pandas</B> ](https://github.com/hosseinmoein/DataFrame/blob/master/benchmarks/pandas_performance.py) | | :-- | ---: | ---: | ---: | | Data generation/load time | 26.9459 secs | 28.4686 secs | 36.6799 secs | | Calculation time | 1.2602 secs | 4.8766 secs | 40.3264 secs | diff --git a/docs/HTML/DataFrame.html b/docs/HTML/DataFrame.html index c7294164..e95ae157 100644 --- a/docs/HTML/DataFrame.html +++ b/docs/HTML/DataFrame.html @@ -253,6 +253,10 @@ <H2 ID="2"><font color="blue">API Reference with code samples <font size="+4">&# <td title="These are used to get information from data" style="text-align:center;background-color:LightGrey;color:DarkBlue">Getting Information <font size="+3">💁</font></td> </tr> + <tr class="item" 
onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';"> + <td title="Performs canonical correlation analysis between two sets of columns"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/canon_corr.html">canon_corr</a>()</td> + </tr> + <tr class="item" onmouseover="this.style.backgroundColor='#ffff66';" onmouseout="this.style.backgroundColor='#d4e3e5';"> <td title="Get column index for the given column name"><a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/col_name_to_idx.html">col_name_to_idx</a>()</td> </tr> diff --git a/docs/HTML/canon_corr.html b/docs/HTML/canon_corr.html new file mode 100644 index 00000000..a912c683 --- /dev/null +++ b/docs/HTML/canon_corr.html @@ -0,0 +1,142 @@ +<!-- +Copyright (c) 2019-2026, Hossein Moein +All rights reserved. +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: +* Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. +* Redistributions in binary form must reproduce the above copyright + notice, this list of conditions and the following disclaimer in the + documentation and/or other materials provided with the distribution. +* Neither the name of Hossein Moein and/or the DataFrame nor the + names of its contributors may be used to endorse or promote products + derived from this software without specific prior written permission. +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND +ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL Hossein Moein BE LIABLE FOR ANY +DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND +ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +--> +<!DOCTYPE html> +<html> + +<head> +<style> +body { + background-image: linear-gradient(Azure, AliceBlue, GhostWhite, WhiteSmoke); +} + +a { color: #FF5555; } +</style> +</head> + +<body style="font-family: Georgia, serif"> + <font size="+3">←</font> <a href="https://htmlpreview.github.io/?https://github.com/hosseinmoein/DataFrame/blob/master/docs/HTML/DataFrame.html">Back to Documentations</a><BR><BR> + + <table border="1"> + + <tr bgcolor="lightblue"> + <th>Signature</th> <th>Description</th> + </tr> + + <tr bgcolor="Azure"> + <td bgcolor="blue"> <font color="white"> + <PRE><B> +template<typename T> +struct CanonCorrResult { + + // These values represent the strength of the linear relationship between + // each pair of canonical variates, ranging from -1 to 1, with higher + // absolute values signifying a stronger association. + // + std::vector<T> coeffs { }; // Canonical correlation coefficients + + // The Redundancy Index is a measure that indicates how much variance in + // one set of variables is explained by the linear combination of the other + // set of variables. This was proposed by Stewart and Love (1968). 
+ // + T x_red_idx { }; // Redundancy index for X + T y_red_idx { }; // Redundancy index for Y +}; + </B></PRE> + </font> + </td> + <td> + Result of Canonical Correlation Analysis as returned by canon_corr() interface<BR> + </td> + </tr> + + </table> + + <BR> + <table border="1"> + + <tr bgcolor="lightblue"> + <th>Signature</th> <th>Description</th> <th>Parameters</th> + </tr> + + <tr bgcolor="Azure"> + <td bgcolor="blue"> <font color="white"> +<PRE><B> +template<typename T> +CanonCorrResult<T> +canon_corr(std::vector<const char *> &&X_col_names, + std::vector<const char *> &&Y_col_names) const; +</B></PRE></font> + </td> + <td> + This performs Canonical Correlation Analysis (CCA) between two sets of columns <I>X</I> and <I>Y</I>. It returns the result in a struct defined above.<BR> + CCA is a statistical method for examining and measuring correlations between two sets of variables. Fundamentally, CCA looks for linear combinations of variables, also referred to as canonical variables, within each set so that the correlation between them is maximized. 
Finding relationships and patterns of linkage between the two groups is the main objective.<BR><BR> + + <B>NOTE</B>: Number of columns in each set must be the same<BR> + </td> + <td width="28%"> + <B>T</B>: Type of the named columns<BR> + <B>X_col_names</B>: Names of the first set of columns<BR> + <B>Y_col_names</B>: Names of the second set of columns<BR> + </td> + </tr> + + </table> + +<pre class="code_syntax" style="color:#000000;background:#ffffff00;"><span class="line_wrapper"><span style="color:#800000; font-weight:bold; ">static</span> <span style="color:#800000; font-weight:bold; ">void</span> test_canon_corr<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span> <span style="color:#800080; ">{</span></span> +<span class="line_wrapper"></span> +<span class="line_wrapper"> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">cout</span> <span style="color:#808030; "><</span><span style="color:#808030; "><</span> <span style="color:#800000; ">"</span><span style="color:#0f69ff; ">\n</span><span style="color:#0000e6; ">Testing canon_corr( ) ...</span><span style="color:#800000; ">"</span> <span style="color:#808030; "><</span><span style="color:#808030; "><</span> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">endl</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"></span> +<span class="line_wrapper"> StrDataFrame df<span style="color:#800080; ">;</span></span> +<span class="line_wrapper"></span> +<span class="line_wrapper"> <span style="color:#800000; font-weight:bold; ">try</span> <span style="color:#800080; ">{</span></span> +<span class="line_wrapper"> df<span style="color:#808030; ">.</span><span style="color:#603000; ">read</span><span style="color:#808030; ">(</span><span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM.csv</span><span style="color:#800000; ">"</span><span 
style="color:#808030; ">,</span> io_format<span style="color:#800080; ">::</span>csv2<span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"> <span style="color:#800080; ">}</span></span> +<span class="line_wrapper"> <span style="color:#800000; font-weight:bold; ">catch</span> <span style="color:#808030; ">(</span><span style="color:#800000; font-weight:bold; ">const</span> DataFrameError <span style="color:#808030; ">&</span>ex<span style="color:#808030; ">)</span> <span style="color:#800080; ">{</span></span> +<span class="line_wrapper"> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">cout</span> <span style="color:#808030; "><</span><span style="color:#808030; "><</span> ex<span style="color:#808030; ">.</span>what<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span> <span style="color:#808030; "><</span><span style="color:#808030; "><</span> <span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">endl</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"> <span style="color:#800080; ">}</span></span> +<span class="line_wrapper"></span> +<span class="line_wrapper"> <span style="color:#800000; font-weight:bold; ">const</span> <span style="color:#800000; font-weight:bold; ">auto</span> result <span style="color:#808030; ">=</span> df<span style="color:#808030; ">.</span>canon_corr<span style="color:#800080; "><</span><span style="color:#800000; font-weight:bold; ">double</span><span style="color:#800080; ">></span><span style="color:#808030; ">(</span><span style="color:#800080; ">{</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Close</span><span style="color:#800000; ">"</span><span style="color:#808030; ">,</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Open</span><span 
style="color:#800000; ">"</span> <span style="color:#800080; ">}</span><span style="color:#808030; ">,</span> <span style="color:#800080; ">{</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_High</span><span style="color:#800000; ">"</span><span style="color:#808030; ">,</span> <span style="color:#800000; ">"</span><span style="color:#0000e6; ">IBM_Low</span><span style="color:#800000; ">"</span> <span style="color:#800080; ">}</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"></span> +<span class="line_wrapper"> assert<span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>coeffs<span style="color:#808030; ">.</span>size<span style="color:#808030; ">(</span><span style="color:#808030; ">)</span> <span style="color:#808030; ">=</span><span style="color:#808030; ">=</span> <span style="color:#008c00; ">2</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"> assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>coeffs<span style="color:#808030; ">[</span><span style="color:#008c00; ">0</span><span style="color:#808030; ">]</span> <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.999944</span><span style="color:#808030; ">)</span> <span style="color:#808030; "><</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"> assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; 
">.</span>coeffs<span style="color:#808030; ">[</span><span style="color:#008c00; ">1</span><span style="color:#808030; ">]</span> <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.262927</span><span style="color:#808030; ">)</span> <span style="color:#808030; "><</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"> assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>x_red_idx <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.534073</span><span style="color:#808030; ">)</span> <span style="color:#808030; "><</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"> assert<span style="color:#808030; ">(</span><span style="color:#666616; ">std</span><span style="color:#800080; ">::</span><span style="color:#603000; ">fabs</span><span style="color:#808030; ">(</span>result<span style="color:#808030; ">.</span>y_red_idx <span style="color:#808030; ">-</span> <span style="color:#008000; ">0.535897</span><span style="color:#808030; ">)</span> <span style="color:#808030; "><</span> <span style="color:#008000; ">0.000001</span><span style="color:#808030; ">)</span><span style="color:#800080; ">;</span></span> +<span class="line_wrapper"><span style="color:#800080; ">}</span></span> +<span class="line_wrapper"></span></pre> + + <BR><img src="https://github.com/hosseinmoein/DataFrame/blob/master/docs/LionLookingUp.jpg?raw=true" alt="C++ DataFrame" + width="200" height="200" style="float:right"/> + +</body> +</html> + +<!-- +Local Variables: +mode:HTML +tab-width:4 +c-basic-offset:4 +End: +--> diff --git 
a/include/DataFrame/DataFrame.h b/include/DataFrame/DataFrame.h index 58315e39..1f82edc8 100644 --- a/include/DataFrame/DataFrame.h +++ b/include/DataFrame/DataFrame.h @@ -3891,6 +3891,30 @@ class DataFrame : public ThreadGranularity { normalization_type norm_type = normalization_type::z_score) const; + // This performs Canonical Correlation Analysis (CCA) between two sets of + // columns X and Y. It returns the result in a CanonCorrResult struct. + // + // CCA is a statistical method for examining and measuring correlations + // between two sets of variables. Fundamentally, CCA looks for linear + // combinations of variables, also referred to as canonical variables, + // within each set so that the correlation between them is maximized. + // Finding relationships and patterns of linkage between the two groups + // is the main objective. + // + // NOTE: Number of columns in each set must be the same + // + // T: + // Type of the named columns + // X_col_names: + // Names of the first set of columns + // Y_col_names: + // Names of the second set of columns + // + template<typename T> + [[nodiscard]] CanonCorrResult<T> + canon_corr(std::vector<const char *> &&X_col_names, + std::vector<const char *> &&Y_col_names) const; + // This function returns a DataFrame indexed by std::string that provides // a few statistics about the columns of the calling DataFrame. 
// The statistics are: diff --git a/include/DataFrame/DataFrameTypes.h b/include/DataFrame/DataFrameTypes.h index 5ec0d834..11efc75b 100644 --- a/include/DataFrame/DataFrameTypes.h +++ b/include/DataFrame/DataFrameTypes.h @@ -733,6 +733,27 @@ struct PCAParams { // ---------------------------------------------------------------------------- +// Canonical correlation analysis result +// +template<typename T> +struct CanonCorrResult { + + // These values represent the strength of the linear relationship between + // each pair of canonical variates, ranging from -1 to 1, with higher + // absolute values signifying a stronger association. + // + std::vector<T> coeffs { }; // Canonical correlation coefficients + + // The Redundancy Index is a measure that indicates how much variance in + // one set of variables is explained by the linear combination of the other + // set of variables. This was proposed by Stewart and Love (1968). + // + T x_red_idx { }; // Redundancy index for X + T y_red_idx { }; // Redundancy index for Y +}; + +// ---------------------------------------------------------------------------- + template<typename T> struct RandGenParams { diff --git a/include/DataFrame/Internals/DataFrame_get.tcc b/include/DataFrame/Internals/DataFrame_get.tcc index 38998c9a..a61223f6 100644 --- a/include/DataFrame/Internals/DataFrame_get.tcc +++ b/include/DataFrame/Internals/DataFrame_get.tcc @@ -1052,6 +1052,96 @@ compact_svd(std::vector<const char *> &&col_names, return (std::make_tuple(U, S, V)); } +// ---------------------------------------------------------------------------- + +template<typename I, typename H> +template<typename T> +CanonCorrResult<T> DataFrame<I, H>:: +canon_corr(std::vector<const char *> &&X_col_names, + std::vector<const char *> &&Y_col_names) const { + + using col_mat_t = Matrix<T, matrix_orient::column_major>; + +#ifdef HMDF_SANITY_EXCEPTIONS + if (X_col_names.size() != Y_col_names.size()) + throw NotFeasible("canon_corr(): " + "Two sets must 
have same number of variables"); +#endif // HMDF_SANITY_EXCEPTIONS + + size_type min_col_s { indices_.size() }; + std::vector<const ColumnVecType<T> *> columns + (X_col_names.size() + Y_col_names.size(), nullptr); + SpinGuard guard { lock_ }; + + for (size_type i { 0 }; i < X_col_names.size(); ++i) { + columns[i] = &get_column<T>(X_col_names[i], false); + if (columns[i]->size() < min_col_s) + min_col_s = columns[i]->size(); + } + for (size_type i { 0 }; i < Y_col_names.size(); ++i) { + const size_type idx = i + X_col_names.size(); + + columns[idx] = &get_column<T>(Y_col_names[i], false); + if (columns[idx]->size() < min_col_s) + min_col_s = columns[idx]->size(); + } + guard.release(); + + col_mat_t X { long(min_col_s), long(X_col_names.size()) }; + + for (size_type i { 0 }; i < X_col_names.size(); ++i) + X.set_column(columns[i]->begin(), i); + + col_mat_t Y { long(min_col_s), long(Y_col_names.size()) }; + + for (size_type i { 0 }; i < Y_col_names.size(); ++i) + Y.set_column(columns[i + X_col_names.size()]->begin(), i); + + const auto XY_cov = _calc_centered_cov_(X, Y); + const auto X_cov = _calc_centered_cov_(X, X); + const auto Y_cov = _calc_centered_cov_(Y, Y); + const auto sq_root_mat = + X_cov.inverse() * XY_cov * Y_cov.inverse() * XY_cov.transpose(); + col_mat_t U; + col_mat_t S; + col_mat_t V; + + sq_root_mat.svd(U, S, V, false); + + CanonCorrResult<T> result; + + result.coeffs.reserve(S.rows()); + for (long i { 0 }; i < S.rows(); ++i) + result.coeffs.push_back(S(i, 0)); + + T X_cov_diag_sum { 0 }; + T Y_cov_diag_sum { 0 }; + + for (long i { 0 }; i < X_cov.rows(); ++i) { + X_cov_diag_sum += X_cov(i, i); + Y_cov_diag_sum += Y_cov(i, i); + } + + T redun { 0 }; + + for (long i { 0 }; i < X_cov.rows(); ++i) { + const T S_val = S(i, 0); + + redun += S_val * S_val * X_cov(i, i); + } + result.x_red_idx = redun / X_cov_diag_sum; + + redun = 0; + for (long i { 0 }; i < Y_cov.rows(); ++i) { + const T S_val = S(i, 0); + + redun += S_val * S_val * Y_cov(i, i); + } + 
result.y_red_idx = redun / Y_cov_diag_sum; + + return (result); +} + } // namespace hmdf // ---------------------------------------------------------------------------- diff --git a/include/DataFrame/Internals/DataFrame_standalone.tcc b/include/DataFrame/Internals/DataFrame_standalone.tcc index ecf10387..76c28852 100644 --- a/include/DataFrame/Internals/DataFrame_standalone.tcc +++ b/include/DataFrame/Internals/DataFrame_standalone.tcc @@ -32,6 +32,7 @@ SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #include <DataFrame/Utils/DateTime.h> #include <DataFrame/Utils/Endianness.h> #include <DataFrame/Utils/FixedSizeString.h> +#include <DataFrame/Utils/Matrix.h> #include <DataFrame/Utils/Threads/ThreadGranularity.h> #include <cctype> @@ -1868,6 +1869,37 @@ _read_binary_str_dbl_map_(STRM &strm, V &map_vec, bool needs_flipping, // ---------------------------------------------------------------------------- +template<typename MA> +inline static typename std::remove_reference<MA>::type +_calc_centered_cov_(const MA &mat1, const MA &mat2) { + + using mat_t = typename std::remove_reference<MA>::type; + + mat_t X; + mat_t Y; + + mat1.get_centered(X); + mat2.get_centered(Y); + + mat_t result = X.transpose2() * Y; + const typename mat_t::value_type denom = X.rows() - 1; + + if constexpr (result.orientation() == matrix_orient::column_major) { + for (long c = 0; c < result.cols(); ++c) + for (long r = 0; r < result.rows(); ++r) + result(r, c) /= denom; + } + else { + for (long r = 0; r < result.rows(); ++r) + for (long c = 0; c < result.cols(); ++c) + result(r, c) /= denom; + } + + return (result); +} + +// ---------------------------------------------------------------------------- + // // Specializing std::hash for tuples // diff --git a/include/DataFrame/Utils/Matrix.tcc b/include/DataFrame/Utils/Matrix.tcc index a94c5725..4edf41fa 100644 --- a/include/DataFrame/Utils/Matrix.tcc +++ b/include/DataFrame/Utils/Matrix.tcc @@ -1447,8 +1447,8 @@ svd(MA1 &U, MA2 &S, MA3 
&V, bool full_size) const { const size_type min_dem = std::min(rows(), cols()); #ifdef HMDF_SANITY_EXCEPTIONS - if (min_dem < 3) - throw DataFrameError("Matrix::svd(): MAtrix is too small"); + if (min_dem < 2) + throw DataFrameError("Matrix::svd(): Matrix is too small"); #endif // HMDF_SANITY_EXCEPTIONS Matrix self_tmp = *this; @@ -2059,9 +2059,24 @@ inline void Matrix<T, MO, IS_SYM>::adjoint (MA &that) const { #endif // HMDF_SANITY_EXCEPTIONS that.resize(rows(), cols()); - for (size_type r = 0; r < rows(); ++r) - for (size_type c = 0; c < cols(); ++c) - that(c, r) = cofactor(r, c); + + auto lbd = + [&that, this](auto begin, auto end) -> void { + for (size_type r = begin; r < end; ++r) + for (size_type c = begin; c < end; ++c) + that(c, r) = cofactor(r, c); + }; + const long thread_level = + (cols() >= 10L) ? ThreadGranularity::get_thread_level() : 0; + + if (thread_level > 2) { + auto futures = + ThreadGranularity::thr_pool_.parallel_loop(0L, cols(), + std::move(lbd)); + + for (auto &fut : futures) fut.get(); + } + else lbd(0L, cols()); return; } diff --git a/src/Utils/DateTime.cc b/src/Utils/DateTime.cc index a274f924..83ecb36f 100644 --- a/src/Utils/DateTime.cc +++ b/src/Utils/DateTime.cc @@ -364,7 +364,7 @@ DateTime &DateTime::operator = (const char *s) { while (::isspace (*str)) ++str; - std::size_t str_len = std::strlen (str); + std::size_t str_len = std::strlen (str); assert(str_len > 3); @@ -591,6 +591,10 @@ bool DateTime::is_us_business_day () const noexcept { (w_day == DT_WEEKDAY::THU && (m_day >= 22 && m_day <= 28) && mon == DT_MONTH::NOV) || + // President Jimmy Carter day of mourning (Jan 9, 2025) + // + (date_part == 20250109) || + is_xmas ())); } diff --git a/test/dataframe_tester_4.cc b/test/dataframe_tester_4.cc index bff8578a..059faf72 100644 --- a/test/dataframe_tester_4.cc +++ b/test/dataframe_tester_4.cc @@ -2492,7 +2492,7 @@ static void test_get_data_by_spectral() { assert(result_view[1].get_index()[46] == "2019-01-22"); 
assert(result_view[1].get_column<double>("IBM_High")[20] == 121.68); assert(result_view[1].get_column<long>("IBM_Volume")[35] == 4346700); - + assert(result_df[2].get_index().size() == 452); assert(result_df[2].get_column<double>("IBM_Open").size() == 452); assert(result_df[2].get_index()[0] == "2017-12-20"); @@ -2509,6 +2509,31 @@ static void test_get_data_by_spectral() { // ---------------------------------------------------------------------------- +static void test_canon_corr() { + + std::cout << "\nTesting canon_corr( ) ..." << std::endl; + + StrDataFrame df; + + try { + df.read("IBM.csv", io_format::csv2); + } + catch (const DataFrameError &ex) { + std::cout << ex.what() << std::endl; + } + + const auto result = df.canon_corr<double>({ "IBM_Close", "IBM_Open" }, + { "IBM_High", "IBM_Low" }); + + assert(result.coeffs.size() == 2); + assert(std::fabs(result.coeffs[0] - 0.999944) < 0.000001); + assert(std::fabs(result.coeffs[1] - 0.262927) < 0.000001); + assert(std::fabs(result.x_red_idx - 0.534073) < 0.000001); + assert(std::fabs(result.y_red_idx - 0.535897) < 0.000001); +} + +// ---------------------------------------------------------------------------- + int main(int, char *[]) { MyDataFrame::set_optimum_thread_level(); @@ -2554,6 +2579,7 @@ int main(int, char *[]) { test_compact_svd(); test_SpectralClusteringVisitor(); test_get_data_by_spectral(); + test_canon_corr(); return (0); }