From fc80fee714ba8da14c8450baa6b21e333bde0f15 Mon Sep 17 00:00:00 2001 From: Severin Dicks <37635888+Intron7@users.noreply.github.com> Date: Mon, 7 Aug 2023 17:56:52 +0200 Subject: [PATCH] Backport PR #2589: Fixed wrong order for groups with logreg --- docs/release-notes/1.9.4.md | 1 + scanpy/tests/test_rank_genes_groups_logreg.py | 32 +++++++++++++++---- scanpy/tools/_rank_genes_groups.py | 11 +++++-- 3 files changed, 34 insertions(+), 10 deletions(-) diff --git a/docs/release-notes/1.9.4.md b/docs/release-notes/1.9.4.md index 4ee17c4d6e..1943c8d69a 100644 --- a/docs/release-notes/1.9.4.md +++ b/docs/release-notes/1.9.4.md @@ -6,3 +6,4 @@ * Support scikit-learn 1.3 {pr}`2515` {smaller}`P Angerer` * Deal with `None` value vanishing from things like `.uns['log1p']` {pr}`2546` {smaller}`SP Shen` * Depend on `igraph` instead of `python-igraph` {pr}`2566` {smaller}`P Angerer` +* {func}`~scanpy.tl.rank_genes_groups` now handles unsorted groups as intended {pr}`2589` {smaller}`S Dicks` diff --git a/scanpy/tests/test_rank_genes_groups_logreg.py b/scanpy/tests/test_rank_genes_groups_logreg.py index c7808e74e2..ce9d024745 100644 --- a/scanpy/tests/test_rank_genes_groups_logreg.py +++ b/scanpy/tests/test_rank_genes_groups_logreg.py @@ -2,12 +2,10 @@ import numpy as np import scanpy as sc +import pandas as pd -@pytest.mark.parametrize( - "method", - ["t-test", "logreg"], -) +@pytest.mark.parametrize('method', ['t-test', 'logreg']) def test_rank_genes_groups_with_renamed_categories(method): adata = sc.datasets.blobs(n_variables=4, n_centers=3, n_observations=200) assert np.allclose(adata.X[1], [9.214668, -2.6487126, 4.2020774, 0.51076424]) @@ -30,14 +28,34 @@ def test_rank_genes_groups_with_renamed_categories_use_rep(): adata = sc.datasets.blobs(n_variables=4, n_centers=3, n_observations=200) assert np.allclose(adata.X[1], [9.214668, -2.6487126, 4.2020774, 0.51076424]) - adata.layers["to_test"] = adata.X.copy() + adata.layers['to_test'] = adata.X.copy() adata.X = adata.X[::-1, :] sc.tl.rank_genes_groups( - adata, 'blobs', method='logreg', layer="to_test", use_raw=False + adata, 'blobs', method='logreg', layer='to_test', use_raw=False ) assert adata.uns['rank_genes_groups']['names'].dtype.names == ('0', '1', '2') assert adata.uns['rank_genes_groups']['names'][0].tolist() == ('1', '3', '0') - sc.tl.rank_genes_groups(adata, 'blobs', method="logreg") + sc.tl.rank_genes_groups(adata, 'blobs', method='logreg') assert not adata.uns['rank_genes_groups']['names'][0].tolist() == ('3', '1', '0') + + +def test_rank_genes_groups_with_unsorted_groups(): + adata = sc.datasets.blobs(n_variables=10, n_centers=5, n_observations=200) + adata._sanitize() + adata.rename_categories('blobs', ['Zero', 'One', 'Two', 'Three', 'Four']) + bdata = adata.copy() + sc.tl.rank_genes_groups( + adata, 'blobs', groups=['Zero', 'One', 'Three'], method='logreg' + ) + sc.tl.rank_genes_groups( + bdata, 'blobs', groups=['One', 'Three', 'Zero'], method='logreg' + ) + array_ad = pd.DataFrame( + adata.uns['rank_genes_groups']['scores']['Three'] + ).to_numpy() + array_bd = pd.DataFrame( + bdata.uns['rank_genes_groups']['scores']['Three'] + ).to_numpy() + np.testing.assert_equal(array_ad, array_bd) diff --git a/scanpy/tools/_rank_genes_groups.py b/scanpy/tools/_rank_genes_groups.py index 716f7efc21..ac00b2d627 100644 --- a/scanpy/tools/_rank_genes_groups.py +++ b/scanpy/tools/_rank_genes_groups.py @@ -341,12 +341,17 @@ def logreg(self, **kwds): clf = LogisticRegression(**kwds) clf.fit(X, self.grouping.cat.codes) scores_all = clf.coef_ - for igroup, _ in enumerate(self.groups_order): + # not all codes necessarily appear in data + existing_codes = np.unique(self.grouping.cat.codes) + for igroup, cat in enumerate(self.groups_order): if len(self.groups_order) <= 2: # binary logistic regression scores = scores_all[0] else: - scores = scores_all[igroup] - + # cat code is index of cat value in .categories + cat_code: int = np.argmax(self.grouping.cat.categories == cat) + # index of scores row is index of cat code in array of existing codes + scores_idx: int = np.argmax(existing_codes == cat_code) + scores = scores_all[scores_idx] yield igroup, scores, None if len(self.groups_order) <= 2: