From ce7545df20cf787cadfe7c1bb2998c23240116ef Mon Sep 17 00:00:00 2001 From: Raphael Vallat Date: Wed, 31 Mar 2021 10:06:28 -0700 Subject: [PATCH] Fix invalid computation of skipped correlation + improved testing --- docs/changelog.rst | 5 +++-- pingouin/correlation.py | 6 +++--- pingouin/tests/test_correlation.py | 26 +++++++++++++++++++------- 3 files changed, 25 insertions(+), 12 deletions(-) diff --git a/docs/changelog.rst b/docs/changelog.rst index 9b27e865..56f18e40 100644 --- a/docs/changelog.rst +++ b/docs/changelog.rst @@ -8,8 +8,9 @@ v0.3.11 (unreleased) **Bugfixes** -a. Passing a wrong ``tail`` argument to :py:func:`pingouin.correlation.corr` now *always* raises an error (see `PR 160 `_). - In previous versions of ``pingouin``, using any ``method`` other than ``"pearson"`` and a wrong ``tail`` argument such as ``"two-tailed"`` or ``"both"`` +a. Fix invalid computation of the robust skipped correlation in :py:func:`pingouin.correlation.corr` (see `issue 164 `_). +b. Passing a wrong ``tail`` argument to :py:func:`pingouin.corr` now *always* raises an error (see `PR 160 `_). + In previous versions of pingouin, using any ``method`` other than ``"pearson"`` and a wrong ``tail`` argument such as ``"two-tailed"`` or ``"both"`` (instead of the correct ``"two-sided"``) may have resulted in silently returning a one-sided p-value. v0.3.10 (February 2021) diff --git a/pingouin/correlation.py b/pingouin/correlation.py index e98c30c7..fa5bf7e1 100644 --- a/pingouin/correlation.py +++ b/pingouin/correlation.py @@ -73,13 +73,13 @@ def skipped(x, y, corr_type='spearman'): # Compute center and distance to center center = MinCovDet(random_state=42).fit(X).location_ B = X - center - B2 = B**2 - bot = B2.sum(axis=1) + bot = (B**2).sum(axis=1) # Loop over rows dis = np.zeros(shape=(nrows, nrows)) for i in np.arange(nrows): if bot[i] != 0: # Avoid division by zero error - dis[i, :] = np.linalg.norm(B * B2[i, :] / bot[i], axis=1) + dis[i, :] = np.linalg.norm( + B.dot(B[i, :, None]) * B[i, :] / bot[i], axis=1) # Detect outliers def idealf(x): diff --git a/pingouin/tests/test_correlation.py b/pingouin/tests/test_correlation.py index fffdad25..39462b33 100644 --- a/pingouin/tests/test_correlation.py +++ b/pingouin/tests/test_correlation.py @@ -14,30 +14,42 @@ def test_corr(self): np.random.seed(123) mean, cov = [4, 6], [(1, .6), (.6, 1)] x, y = np.random.multivariate_normal(mean, cov, 30).T + x2, y2 = x.copy(), y.copy() x[3], y[5] = 12, -8 + x2[3], y2[5] = 7, 2.6 corr(x, y, method='pearson', tail='one-sided') corr(x, y, method='spearman', tail='two-sided') corr(x, y, method='kendall') corr(x, y, method='shepherd', tail='two-sided') - # Compare with robust corr toolbox + # Skipped correlation -- compare with robust corr toolbox + # https://sourceforge.net/projects/robustcorrtool/ stats = corr(x, y, method='skipped') - assert np.round(stats['r'].to_numpy(), 3) == 0.512 + assert np.round(stats['r'].to_numpy(), 4) == 0.5123 assert stats['outliers'].to_numpy() == 2 - # Changing the method using kwargs - sk_sp = corr(x, y, method='skipped', corr_type='spearman') - sk_pe = corr(x, y, method='skipped', corr_type='pearson') + sk_sp = corr(x2, y2, method='skipped') + assert np.round(sk_sp['r'].to_numpy(), 4) == 0.5123 + assert sk_sp['outliers'].to_numpy() == 2 + # Pearson skipped correlation + sk_pe = corr(x2, y2, method='skipped', corr_type='pearson') + assert np.round(sk_pe['r'].to_numpy(), 4) == 0.5254 + assert sk_pe['outliers'].to_numpy() == 2 assert not sk_sp.equals(sk_pe) + # Shepherd -- cannot directly compare because based on random bootstrap stats = corr(x, y, method='shepherd') assert stats['outliers'].to_numpy() == 2 _, _, outliers = skipped(x, y, corr_type='pearson') assert outliers.size == x.size assert stats['n'].to_numpy() == 30 + # Percbend -- compare with robust corr toolbox stats = corr(x, y, method='percbend') - assert np.round(stats['r'].to_numpy(), 3) == 0.484 + assert np.round(stats['r'].to_numpy(), 4) == 0.4843 + stats = corr(x2, y2, method='percbend') + assert np.round(stats['r'].to_numpy(), 4) == 0.4843 + stats = corr(x, y, method='percbend', beta=.5) + assert np.round(stats['r'].to_numpy(), 4) == 0.4848 # Compare biweight correlation to astropy stats = corr(x, y, method='bicor') assert np.isclose(stats['r'].to_numpy(), 0.4951417784979) - # Changing the value of C using kwargs stats = corr(x, y, method='bicor', c=5) assert np.isclose(stats['r'].to_numpy(), 0.4940706950017) # Not normally distributed