Skip to content

Commit

Permalink
Backport PR #2589: Fixed wrong order for groups with logreg
Browse files Browse the repository at this point in the history
  • Loading branch information
Intron7 authored and meeseeksmachine committed Aug 7, 2023
1 parent ab2ba2d commit fc80fee
Show file tree
Hide file tree
Showing 3 changed files with 34 additions and 10 deletions.
1 change: 1 addition & 0 deletions docs/release-notes/1.9.4.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@
* Support scikit-learn 1.3 {pr}`2515` {smaller}`P Angerer`
* Deal with `None` value vanishing from things like `.uns['log1p']` {pr}`2546` {smaller}`SP Shen`
* Depend on `igraph` instead of `python-igraph` {pr}`2566` {smaller}`P Angerer`
* {func}`~scanpy.tl.rank_genes_groups` now handles unsorted groups as intended {pr}`2589` {smaller}`S Dicks`
32 changes: 25 additions & 7 deletions scanpy/tests/test_rank_genes_groups_logreg.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,12 +2,10 @@

import numpy as np
import scanpy as sc
import pandas as pd


@pytest.mark.parametrize(
"method",
["t-test", "logreg"],
)
@pytest.mark.parametrize('method', ['t-test', 'logreg'])
def test_rank_genes_groups_with_renamed_categories(method):
adata = sc.datasets.blobs(n_variables=4, n_centers=3, n_observations=200)
assert np.allclose(adata.X[1], [9.214668, -2.6487126, 4.2020774, 0.51076424])
Expand All @@ -30,14 +28,34 @@ def test_rank_genes_groups_with_renamed_categories_use_rep():
adata = sc.datasets.blobs(n_variables=4, n_centers=3, n_observations=200)
assert np.allclose(adata.X[1], [9.214668, -2.6487126, 4.2020774, 0.51076424])

adata.layers["to_test"] = adata.X.copy()
adata.layers['to_test'] = adata.X.copy()
adata.X = adata.X[::-1, :]

sc.tl.rank_genes_groups(
adata, 'blobs', method='logreg', layer="to_test", use_raw=False
adata, 'blobs', method='logreg', layer='to_test', use_raw=False
)
assert adata.uns['rank_genes_groups']['names'].dtype.names == ('0', '1', '2')
assert adata.uns['rank_genes_groups']['names'][0].tolist() == ('1', '3', '0')

sc.tl.rank_genes_groups(adata, 'blobs', method="logreg")
sc.tl.rank_genes_groups(adata, 'blobs', method='logreg')
assert not adata.uns['rank_genes_groups']['names'][0].tolist() == ('3', '1', '0')


def test_rank_genes_groups_with_unsorted_groups():
    """Check that ``logreg`` scores do not depend on the order of ``groups``.

    The ranking is run twice on identical data: once with the requested
    groups listed in the same order as the renamed categories, and once with
    the same groups listed in a different order. The scores reported for
    ``'Three'`` must be equal in both runs.
    """
    reference = sc.datasets.blobs(n_variables=10, n_centers=5, n_observations=200)
    reference._sanitize()
    reference.rename_categories('blobs', ['Zero', 'One', 'Two', 'Three', 'Four'])
    # Copy before ranking so both runs start from the same state.
    reordered = reference.copy()

    runs = (
        (reference, ['Zero', 'One', 'Three']),
        (reordered, ['One', 'Three', 'Zero']),
    )
    for data, group_order in runs:
        sc.tl.rank_genes_groups(data, 'blobs', groups=group_order, method='logreg')

    scores_reference, scores_reordered = (
        pd.DataFrame(data.uns['rank_genes_groups']['scores']['Three']).to_numpy()
        for data, _ in runs
    )
    np.testing.assert_equal(scores_reference, scores_reordered)
11 changes: 8 additions & 3 deletions scanpy/tools/_rank_genes_groups.py
Original file line number Diff line number Diff line change
Expand Up @@ -341,12 +341,17 @@ def logreg(self, **kwds):
clf = LogisticRegression(**kwds)
clf.fit(X, self.grouping.cat.codes)
scores_all = clf.coef_
for igroup, _ in enumerate(self.groups_order):
# not all codes necessarily appear in data
existing_codes = np.unique(self.grouping.cat.codes)
for igroup, cat in enumerate(self.groups_order):
if len(self.groups_order) <= 2: # binary logistic regression
scores = scores_all[0]
else:
scores = scores_all[igroup]

# cat code is index of cat value in .categories
cat_code: int = np.argmax(self.grouping.cat.categories == cat)
# index of scores row is index of cat code in array of existing codes
scores_idx: int = np.argmax(existing_codes == cat_code)
scores = scores_all[scores_idx]
yield igroup, scores, None

if len(self.groups_order) <= 2:
Expand Down

0 comments on commit fc80fee

Please sign in to comment.