From ce7545df20cf787cadfe7c1bb2998c23240116ef Mon Sep 17 00:00:00 2001
From: Raphael Vallat <raphaelvallat9@gmail.com>
Date: Wed, 31 Mar 2021 10:06:28 -0700
Subject: [PATCH] Fix invalid computation of skipped correlation + improved
 testing

---
 docs/changelog.rst                 |  5 +++--
 pingouin/correlation.py            |  6 +++---
 pingouin/tests/test_correlation.py | 26 +++++++++++++++++++-------
 3 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/docs/changelog.rst b/docs/changelog.rst
index 9b27e865..56f18e40 100644
--- a/docs/changelog.rst
+++ b/docs/changelog.rst
@@ -8,8 +8,9 @@ v0.3.11 (unreleased)
 
 **Bugfixes**
 
-a. Passing a wrong ``tail`` argument to :py:func:`pingouin.correlation.corr` now *always* raises an error (see `PR 160 <https://github.com/raphaelvallat/pingouin/pull/160>`_).
-   In previous versions of ``pingouin``, using any ``method`` other than ``"pearson"`` and a wrong ``tail`` argument such as ``"two-tailed"`` or ``"both"``
+a. Fix invalid computation of the robust skipped correlation in :py:func:`pingouin.correlation.corr` (see `issue 164 <https://github.com/raphaelvallat/pingouin/issues/164>`_).
+b. Passing a wrong ``tail`` argument to :py:func:`pingouin.corr` now *always* raises an error (see `PR 160 <https://github.com/raphaelvallat/pingouin/pull/160>`_).
+   In previous versions of pingouin, using any ``method`` other than ``"pearson"`` and a wrong ``tail`` argument such as ``"two-tailed"`` or ``"both"``
    (instead of the correct ``"two-sided"``) may have resulted in silently returning a one-sided p-value.
 
 v0.3.10 (February 2021)
diff --git a/pingouin/correlation.py b/pingouin/correlation.py
index e98c30c7..fa5bf7e1 100644
--- a/pingouin/correlation.py
+++ b/pingouin/correlation.py
@@ -73,13 +73,13 @@ def skipped(x, y, corr_type='spearman'):
     # Compute center and distance to center
     center = MinCovDet(random_state=42).fit(X).location_
     B = X - center
-    B2 = B**2
-    bot = B2.sum(axis=1)
+    bot = (B**2).sum(axis=1)
     # Loop over rows
     dis = np.zeros(shape=(nrows, nrows))
     for i in np.arange(nrows):
         if bot[i] != 0:  # Avoid division by zero error
-            dis[i, :] = np.linalg.norm(B * B2[i, :] / bot[i], axis=1)
+            dis[i, :] = np.linalg.norm(
+                B.dot(B[i, :, None]) * B[i, :] / bot[i], axis=1)
 
     # Detect outliers
     def idealf(x):
diff --git a/pingouin/tests/test_correlation.py b/pingouin/tests/test_correlation.py
index fffdad25..39462b33 100644
--- a/pingouin/tests/test_correlation.py
+++ b/pingouin/tests/test_correlation.py
@@ -14,30 +14,42 @@ def test_corr(self):
         np.random.seed(123)
         mean, cov = [4, 6], [(1, .6), (.6, 1)]
         x, y = np.random.multivariate_normal(mean, cov, 30).T
+        x2, y2 = x.copy(), y.copy()
         x[3], y[5] = 12, -8
+        x2[3], y2[5] = 7, 2.6
         corr(x, y, method='pearson', tail='one-sided')
         corr(x, y, method='spearman', tail='two-sided')
         corr(x, y, method='kendall')
         corr(x, y, method='shepherd', tail='two-sided')
-        # Compare with robust corr toolbox
+        # Skipped correlation -- compare with robust corr toolbox
+        # https://sourceforge.net/projects/robustcorrtool/
         stats = corr(x, y, method='skipped')
-        assert np.round(stats['r'].to_numpy(), 3) == 0.512
+        assert np.round(stats['r'].to_numpy(), 4) == 0.5123
         assert stats['outliers'].to_numpy() == 2
-        # Changing the method using kwargs
-        sk_sp = corr(x, y, method='skipped', corr_type='spearman')
-        sk_pe = corr(x, y, method='skipped', corr_type='pearson')
+        sk_sp = corr(x2, y2, method='skipped')
+        assert np.round(sk_sp['r'].to_numpy(), 4) == 0.5123
+        assert sk_sp['outliers'].to_numpy() == 2
+        # Pearson skipped correlation
+        sk_pe = corr(x2, y2, method='skipped', corr_type='pearson')
+        assert np.round(sk_pe['r'].to_numpy(), 4) == 0.5254
+        assert sk_pe['outliers'].to_numpy() == 2
         assert not sk_sp.equals(sk_pe)
+        # Shepherd -- cannot directly compare because based on random bootstrap
         stats = corr(x, y, method='shepherd')
         assert stats['outliers'].to_numpy() == 2
         _, _, outliers = skipped(x, y, corr_type='pearson')
         assert outliers.size == x.size
         assert stats['n'].to_numpy() == 30
+        # Percbend -- compare with robust corr toolbox
         stats = corr(x, y, method='percbend')
-        assert np.round(stats['r'].to_numpy(), 3) == 0.484
+        assert np.round(stats['r'].to_numpy(), 4) == 0.4843
+        stats = corr(x2, y2, method='percbend')
+        assert np.round(stats['r'].to_numpy(), 4) == 0.4843
+        stats = corr(x, y, method='percbend', beta=.5)
+        assert np.round(stats['r'].to_numpy(), 4) == 0.4848
         # Compare biweight correlation to astropy
         stats = corr(x, y, method='bicor')
         assert np.isclose(stats['r'].to_numpy(), 0.4951417784979)
-        # Changing the value of C using kwargs
         stats = corr(x, y, method='bicor', c=5)
         assert np.isclose(stats['r'].to_numpy(), 0.4940706950017)
         # Not normally distributed