From 1daae5b019755b0345ebdfb86be9a88ca292fd8f Mon Sep 17 00:00:00 2001 From: Philipp A Date: Tue, 19 Dec 2023 11:28:35 +0100 Subject: [PATCH] Backport PR #2782: Fix Seurat n_top_genes default (#2783) --- docs/release-notes/1.9.7.md | 2 + .../preprocessing/_highly_variable_genes.py | 22 +- scanpy/tests/test_highly_variable_genes.py | 280 +++++++++--------- 3 files changed, 159 insertions(+), 145 deletions(-) diff --git a/docs/release-notes/1.9.7.md b/docs/release-notes/1.9.7.md index e8d626f6c4..8fa8487b8e 100644 --- a/docs/release-notes/1.9.7.md +++ b/docs/release-notes/1.9.7.md @@ -6,3 +6,5 @@ - Specify correct version of `matplotlib` dependency {pr}`2733` {smaller}`P Fisher` - Fix {func}`scanpy.pl.violin` usage of `seaborn.catplot` {pr}`2739` {smaller}`E Roellin` - Fix {func}`scanpy.pp.highly_variable_genes` to handle the combinations of `inplace` and `subset` consistently {pr}`2757` {smaller}`E Roellin` +- Replace usage of various deprecated functionality from {mod}`anndata` and {mod}`pandas` {pr}`2678` {pr}`2779` {smaller}`P Angerer` +- Allow to use default `n_top_genes` when using {func}`scanpy.pp.highly_variable_genes` flavor `'seurat_v3'` {pr}`2782` {smaller}`P Angerer` diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 212d35a596..6ddbd58f33 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -1,5 +1,8 @@ +from __future__ import annotations + import warnings -from typing import Literal, Optional +from inspect import signature +from typing import Literal, Optional, cast import numpy as np import pandas as pd @@ -301,13 +304,13 @@ def _highly_variable_genes_single_batch( def highly_variable_genes( adata: AnnData, - layer: Optional[str] = None, - n_top_genes: Optional[int] = None, - min_disp: Optional[float] = 0.5, - max_disp: Optional[float] = np.inf, - min_mean: Optional[float] = 0.0125, - max_mean: Optional[float] = 3, - span: Optional[float] = 0.3, + layer: str | None = None, + n_top_genes: int | None = None, + min_disp: float | None = 0.5, + max_disp: float | None = np.inf, + min_mean: float | None = 0.0125, + max_mean: float | None = 3, + span: float = 0.3, n_bins: int = 20, flavor: Literal["seurat", "cell_ranger", "seurat_v3"] = "seurat", subset: bool = False, @@ -432,6 +435,9 @@ def highly_variable_genes( ) if flavor == "seurat_v3": + if n_top_genes is None: + sig = signature(_highly_variable_genes_seurat_v3) + n_top_genes = cast(int, sig.parameters["n_top_genes"].default) return _highly_variable_genes_seurat_v3( adata, layer=layer, diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index 722857285d..9e3e5bc1a7 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -1,15 +1,17 @@ -import pytest -import pandas as pd +from pathlib import Path + import numpy as np +import pandas as pd +import pytest + import scanpy as sc -from pathlib import Path from scanpy.testing._helpers import _check_check_values_warnings from scanpy.testing._helpers.data import pbmc3k, pbmc68k_reduced from scanpy.testing._pytest.marks import needs -FILE = Path(__file__).parent / Path('_scripts/seurat_hvg.csv') -FILE_V3 = Path(__file__).parent / Path('_scripts/seurat_hvg_v3.csv.gz') -FILE_V3_BATCH = Path(__file__).parent / Path('_scripts/seurat_hvg_v3_batch.csv') +FILE = Path(__file__).parent / Path("_scripts/seurat_hvg.csv") +FILE_V3 = Path(__file__).parent / Path("_scripts/seurat_hvg_v3.csv.gz") +FILE_V3_BATCH = Path(__file__).parent / Path("_scripts/seurat_hvg_v3_batch.csv") def test_highly_variable_genes_basic(): @@ -18,40 +20,40 @@ def test_highly_variable_genes_basic(): adata = sc.datasets.blobs() np.random.seed(0) - adata.obs['batch'] = np.random.binomial(3, 0.5, size=(adata.n_obs)) - adata.obs['batch'] = adata.obs['batch'].astype('category') - sc.pp.highly_variable_genes(adata, batch_key='batch') - assert 'highly_variable_nbatches' in adata.var.columns - assert 'highly_variable_intersection' in adata.var.columns + adata.obs["batch"] = np.random.binomial(3, 0.5, size=(adata.n_obs)) + adata.obs["batch"] = adata.obs["batch"].astype("category") + sc.pp.highly_variable_genes(adata, batch_key="batch") + assert "highly_variable_nbatches" in adata.var.columns + assert "highly_variable_intersection" in adata.var.columns adata = sc.datasets.blobs() batch = np.random.binomial(4, 0.5, size=(adata.n_obs)) - adata.obs['batch'] = batch - adata.obs['batch'] = adata.obs['batch'].astype('category') - sc.pp.highly_variable_genes(adata, batch_key='batch', n_top_genes=3) - assert 'highly_variable_nbatches' in adata.var.columns - assert adata.var['highly_variable'].sum() == 3 - highly_var_first_layer = adata.var['highly_variable'] + adata.obs["batch"] = batch + adata.obs["batch"] = adata.obs["batch"].astype("category") + sc.pp.highly_variable_genes(adata, batch_key="batch", n_top_genes=3) + assert "highly_variable_nbatches" in adata.var.columns + assert adata.var["highly_variable"].sum() == 3 + highly_var_first_layer = adata.var["highly_variable"] adata = sc.datasets.blobs() new_layer = adata.X.copy() np.random.shuffle(new_layer) - adata.layers['test_layer'] = new_layer - adata.obs['batch'] = batch - adata.obs['batch'] = adata.obs['batch'].astype('category') + adata.layers["test_layer"] = new_layer + adata.obs["batch"] = batch + adata.obs["batch"] = adata.obs["batch"].astype("category") sc.pp.highly_variable_genes( - adata, batch_key='batch', n_top_genes=3, layer='test_layer' + adata, batch_key="batch", n_top_genes=3, layer="test_layer" ) - assert 'highly_variable_nbatches' in adata.var.columns - assert adata.var['highly_variable'].sum() == 3 - assert (highly_var_first_layer != adata.var['highly_variable']).any() + assert "highly_variable_nbatches" in adata.var.columns + assert adata.var["highly_variable"].sum() == 3 + assert (highly_var_first_layer != adata.var["highly_variable"]).any() sc.pp.highly_variable_genes(adata) no_batch_hvg = adata.var.highly_variable.copy() assert no_batch_hvg.any() - adata.obs['batch'] = 'batch' - adata.obs['batch'] = adata.obs['batch'].astype('category') - sc.pp.highly_variable_genes(adata, batch_key='batch') + adata.obs["batch"] = "batch" + adata.obs["batch"] = adata.obs["batch"].astype("category") + sc.pp.highly_variable_genes(adata, batch_key="batch") assert np.all(no_batch_hvg == adata.var.highly_variable) assert np.all(adata.var.highly_variable_intersection == adata.var.highly_variable) @@ -61,19 +63,19 @@ def test_highly_variable_genes_basic(): assert adata.var["highly_variable"].any() colnames = [ - 'means', - 'dispersions', - 'dispersions_norm', - 'highly_variable_nbatches', - 'highly_variable_intersection', - 'highly_variable', + "means", + "dispersions", + "dispersions_norm", + "highly_variable_nbatches", + "highly_variable_intersection", + "highly_variable", ] hvg_df = sc.pp.highly_variable_genes(adata, batch_key="batch", inplace=False) assert np.all(np.isin(colnames, hvg_df.columns)) -@pytest.mark.parametrize('base', [None, 10]) -@pytest.mark.parametrize('flavor', ['seurat', 'cell_ranger']) +@pytest.mark.parametrize("base", [None, 10]) +@pytest.mark.parametrize("flavor", ["seurat", "cell_ranger"]) def test_highly_variable_genes_keep_layer(base, flavor): adata = pbmc3k() # cell_ranger flavor can raise error if many 0 genes @@ -82,9 +84,9 @@ def test_highly_variable_genes_keep_layer(base, flavor): sc.pp.log1p(adata, base=base) X_orig = adata.X.copy() - if flavor == 'seurat': + if flavor == "seurat": sc.pp.highly_variable_genes(adata, n_top_genes=50, flavor=flavor) - elif flavor == 'cell_ranger': + elif flavor == "cell_ranger": sc.pp.highly_variable_genes(adata, flavor=flavor) else: assert False @@ -93,19 +95,19 @@ def test_highly_variable_genes_keep_layer(base, flavor): def _check_pearson_hvg_columns(output_df, n_top_genes): - assert pd.api.types.is_float_dtype(output_df['residual_variances'].dtype) + assert pd.api.types.is_float_dtype(output_df["residual_variances"].dtype) - assert output_df['highly_variable'].values.dtype is np.dtype('bool') - assert np.sum(output_df['highly_variable']) == n_top_genes + assert output_df["highly_variable"].values.dtype is np.dtype("bool") + assert np.sum(output_df["highly_variable"]) == n_top_genes - assert np.nanmax(output_df['highly_variable_rank'].values) <= n_top_genes - 1 + assert np.nanmax(output_df["highly_variable_rank"].values) <= n_top_genes - 1 def test_highly_variable_genes_pearson_residuals_inputchecks(pbmc3k_parametrized_small): adata = pbmc3k_parametrized_small() # depending on check_values, warnings should be raised for non-integer data - if adata.X.dtype == 'float32': + if adata.X.dtype == "float32": adata_noninteger = adata.copy() x, y = np.nonzero(adata_noninteger.X) adata_noninteger.X[x[0], y[0]] = 0.5 @@ -115,30 +117,30 @@ def test_highly_variable_genes_pearson_residuals_inputchecks(pbmc3k_parametrized adata=adata_noninteger, expected_warning="`flavor='pearson_residuals'` expects raw count data, but non-integers were found.", kwargs=dict( - flavor='pearson_residuals', + flavor="pearson_residuals", n_top_genes=100, ), ) # errors should be raised for invalid theta values for theta in [0, -1]: - with pytest.raises(ValueError, match='Pearson residuals require theta > 0'): + with pytest.raises(ValueError, match="Pearson residuals require theta > 0"): sc.experimental.pp.highly_variable_genes( - adata.copy(), theta=theta, flavor='pearson_residuals', n_top_genes=100 + adata.copy(), theta=theta, flavor="pearson_residuals", n_top_genes=100 ) with pytest.raises( - ValueError, match='Pearson residuals require `clip>=0` or `clip=None`.' + ValueError, match="Pearson residuals require `clip>=0` or `clip=None`." ): sc.experimental.pp.highly_variable_genes( - adata.copy(), clip=-1, flavor='pearson_residuals', n_top_genes=100 + adata.copy(), clip=-1, flavor="pearson_residuals", n_top_genes=100 ) -@pytest.mark.parametrize('subset', [True, False]) -@pytest.mark.parametrize('clip', [None, np.Inf, 30]) -@pytest.mark.parametrize('theta', [100, np.Inf]) -@pytest.mark.parametrize('n_top_genes', [100, 200]) +@pytest.mark.parametrize("subset", [True, False]) +@pytest.mark.parametrize("clip", [None, np.Inf, 30]) +@pytest.mark.parametrize("theta", [100, np.Inf]) +@pytest.mark.parametrize("n_top_genes", [100, 200]) def test_highly_variable_genes_pearson_residuals_general( pbmc3k_parametrized_small, subset, clip, theta, n_top_genes ): @@ -149,7 +151,7 @@ def test_highly_variable_genes_pearson_residuals_general( # compute reference output residuals_reference = sc.experimental.pp.normalize_pearson_residuals( adata, clip=clip, theta=theta, inplace=False - )['X'] + )["X"] residual_variances_reference = np.var(residuals_reference, axis=0) if subset: @@ -161,7 +163,7 @@ def test_highly_variable_genes_pearson_residuals_general( # compute output to be tested output_df = sc.experimental.pp.highly_variable_genes( adata, - flavor='pearson_residuals', + flavor="pearson_residuals", n_top_genes=n_top_genes, subset=subset, inplace=False, @@ -171,7 +173,7 @@ def test_highly_variable_genes_pearson_residuals_general( sc.experimental.pp.highly_variable_genes( adata, - flavor='pearson_residuals', + flavor="pearson_residuals", n_top_genes=n_top_genes, subset=subset, inplace=True, @@ -184,43 +186,43 @@ def test_highly_variable_genes_pearson_residuals_general( # check output is complete for key in [ - 'highly_variable', - 'means', - 'variances', - 'residual_variances', - 'highly_variable_rank', + "highly_variable", + "means", + "variances", + "residual_variances", + "highly_variable_rank", ]: assert key in output_df.keys() # check consistency with normalization method if subset: # sort values before comparing as reference is sorted as well for subset case - sort_output_idx = np.argsort(-output_df['residual_variances'].values) + sort_output_idx = np.argsort(-output_df["residual_variances"].values) assert np.allclose( - output_df['residual_variances'].values[sort_output_idx], + output_df["residual_variances"].values[sort_output_idx], residual_variances_reference, ) else: assert np.allclose( - output_df['residual_variances'].values, residual_variances_reference + output_df["residual_variances"].values, residual_variances_reference ) # check hvg flag - hvg_idx = np.where(output_df['highly_variable'])[0] + hvg_idx = np.where(output_df["highly_variable"])[0] topn_idx = np.sort( - np.argsort(-output_df['residual_variances'].values)[:n_top_genes] + np.argsort(-output_df["residual_variances"].values)[:n_top_genes] ) assert np.all(hvg_idx == topn_idx) # check ranks - assert np.nanmin(output_df['highly_variable_rank'].values) == 0 + assert np.nanmin(output_df["highly_variable_rank"].values) == 0 # more general checks on ranks, hvg flag and residual variance _check_pearson_hvg_columns(output_df, n_top_genes) -@pytest.mark.parametrize('subset', [True, False]) -@pytest.mark.parametrize('n_top_genes', [100, 200]) +@pytest.mark.parametrize("subset", [True, False]) +@pytest.mark.parametrize("n_top_genes", [100, 200]) def test_highly_variable_genes_pearson_residuals_batch( pbmc3k_parametrized_small, subset, n_top_genes ): @@ -231,18 +233,18 @@ def test_highly_variable_genes_pearson_residuals_batch( output_df = sc.experimental.pp.highly_variable_genes( adata, - flavor='pearson_residuals', + flavor="pearson_residuals", n_top_genes=n_top_genes, - batch_key='batch', + batch_key="batch", subset=subset, inplace=False, ) sc.experimental.pp.highly_variable_genes( adata, - flavor='pearson_residuals', + flavor="pearson_residuals", n_top_genes=n_top_genes, - batch_key='batch', + batch_key="batch", subset=subset, inplace=True, ) @@ -252,13 +254,13 @@ def test_highly_variable_genes_pearson_residuals_batch( # check output is complete for key in [ - 'highly_variable', - 'means', - 'variances', - 'residual_variances', - 'highly_variable_rank', - 'highly_variable_nbatches', - 'highly_variable_intersection', + "highly_variable", + "means", + "variances", + "residual_variances", + "highly_variable_rank", + "highly_variable_nbatches", + "highly_variable_intersection", ]: assert key in output_df.keys() @@ -266,18 +268,18 @@ def test_highly_variable_genes_pearson_residuals_batch( _check_pearson_hvg_columns(output_df, n_top_genes) # check intersection flag - nbatches = len(np.unique(adata.obs['batch'])) - assert output_df['highly_variable_intersection'].values.dtype is np.dtype('bool') - assert np.sum(output_df['highly_variable_intersection']) <= n_top_genes * nbatches - assert np.all(output_df['highly_variable'][output_df.highly_variable_intersection]) + nbatches = len(np.unique(adata.obs["batch"])) + assert output_df["highly_variable_intersection"].values.dtype is np.dtype("bool") + assert np.sum(output_df["highly_variable_intersection"]) <= n_top_genes * nbatches + assert np.all(output_df["highly_variable"][output_df.highly_variable_intersection]) # check ranks (with batch_key these are the median of within-batch ranks) - assert pd.api.types.is_float_dtype(output_df['highly_variable_rank'].dtype) + assert pd.api.types.is_float_dtype(output_df["highly_variable_rank"].dtype) # check nbatches - assert output_df['highly_variable_nbatches'].values.dtype is np.dtype('int') - assert np.min(output_df['highly_variable_nbatches'].values) >= 0 - assert np.max(output_df['highly_variable_nbatches'].values) <= nbatches + assert output_df["highly_variable_nbatches"].values.dtype is np.dtype("int") + assert np.min(output_df["highly_variable_nbatches"].values) >= 0 + assert np.max(output_df["highly_variable_nbatches"].values) <= nbatches # check subsetting if subset: @@ -287,7 +289,7 @@ def test_highly_variable_genes_pearson_residuals_batch( def test_higly_variable_genes_compare_to_seurat(): - seurat_hvg_info = pd.read_csv(FILE, sep=' ') + seurat_hvg_info = pd.read_csv(FILE, sep=" ") pbmc = pbmc68k_reduced() pbmc.X = pbmc.raw.X @@ -296,30 +298,30 @@ def test_higly_variable_genes_compare_to_seurat(): sc.pp.normalize_per_cell(pbmc, counts_per_cell_after=1e4) sc.pp.log1p(pbmc) sc.pp.highly_variable_genes( - pbmc, flavor='seurat', min_mean=0.0125, max_mean=3, min_disp=0.5, inplace=True + pbmc, flavor="seurat", min_mean=0.0125, max_mean=3, min_disp=0.5, inplace=True ) np.testing.assert_array_equal( - seurat_hvg_info['highly_variable'], pbmc.var['highly_variable'] + seurat_hvg_info["highly_variable"], pbmc.var["highly_variable"] ) # (still) Not equal to tolerance rtol=2e-05, atol=2e-05 # np.testing.assert_allclose(4, 3.9999, rtol=2e-05, atol=2e-05) np.testing.assert_allclose( - seurat_hvg_info['means'], - pbmc.var['means'], + seurat_hvg_info["means"], + pbmc.var["means"], rtol=2e-05, atol=2e-05, ) np.testing.assert_allclose( - seurat_hvg_info['dispersions'], - pbmc.var['dispersions'], + seurat_hvg_info["dispersions"], + pbmc.var["dispersions"], rtol=2e-05, atol=2e-05, ) np.testing.assert_allclose( - seurat_hvg_info['dispersions_norm'], - pbmc.var['dispersions_norm'], + seurat_hvg_info["dispersions_norm"], + pbmc.var["dispersions_norm"], rtol=2e-05, atol=2e-05, ) @@ -328,7 +330,7 @@ def test_higly_variable_genes_compare_to_seurat(): @needs("skmisc") def test_higly_variable_genes_compare_to_seurat_v3(): seurat_hvg_info = pd.read_csv( - FILE_V3, sep=' ', dtype={"variances_norm": np.float64} + FILE_V3, sep=" ", dtype={"variances_norm": np.float64} ) pbmc = pbmc3k() @@ -337,27 +339,27 @@ def test_higly_variable_genes_compare_to_seurat_v3(): pbmc_dense = pbmc.copy() pbmc_dense.X = pbmc_dense.X.toarray() - sc.pp.highly_variable_genes(pbmc, n_top_genes=1000, flavor='seurat_v3') - sc.pp.highly_variable_genes(pbmc_dense, n_top_genes=1000, flavor='seurat_v3') + sc.pp.highly_variable_genes(pbmc, n_top_genes=1000, flavor="seurat_v3") + sc.pp.highly_variable_genes(pbmc_dense, n_top_genes=1000, flavor="seurat_v3") np.testing.assert_array_equal( - seurat_hvg_info['highly_variable'], pbmc.var['highly_variable'] + seurat_hvg_info["highly_variable"], pbmc.var["highly_variable"] ) np.testing.assert_allclose( - seurat_hvg_info['variances'], - pbmc.var['variances'], + seurat_hvg_info["variances"], + pbmc.var["variances"], rtol=2e-05, atol=2e-05, ) np.testing.assert_allclose( - seurat_hvg_info['variances_norm'], - pbmc.var['variances_norm'], + seurat_hvg_info["variances_norm"], + pbmc.var["variances_norm"], rtol=2e-05, atol=2e-05, ) np.testing.assert_allclose( - pbmc_dense.var['variances_norm'], - pbmc.var['variances_norm'], + pbmc_dense.var["variances_norm"], + pbmc.var["variances_norm"], rtol=2e-05, atol=2e-05, ) @@ -366,7 +368,7 @@ def test_higly_variable_genes_compare_to_seurat_v3(): batch[1500:] = 1 pbmc.obs["batch"] = batch df = sc.pp.highly_variable_genes( - pbmc, n_top_genes=4000, flavor='seurat_v3', batch_key="batch", inplace=False + pbmc, n_top_genes=4000, flavor="seurat_v3", batch_key="batch", inplace=False ) df.sort_values( ["highly_variable_nbatches", "highly_variable_rank"], @@ -376,23 +378,27 @@ def test_higly_variable_genes_compare_to_seurat_v3(): ) df = df.iloc[:4000] seurat_hvg_info_batch = pd.read_csv( - FILE_V3_BATCH, sep=' ', dtype={"variances_norm": np.float64} + FILE_V3_BATCH, sep=" ", dtype={"variances_norm": np.float64} ) # ranks might be slightly different due to many genes having same normalized var - seu = pd.Index(seurat_hvg_info_batch['x'].values) + seu = pd.Index(seurat_hvg_info_batch["x"].values) assert len(seu.intersection(df.index)) / 4000 > 0.95 + +@needs("skmisc") +def test_higly_variable_genes_seurat_v3_warning(): + pbmc = pbmc3k()[:200].copy() sc.pp.log1p(pbmc) with pytest.warns( UserWarning, match="`flavor='seurat_v3'` expects raw count data, but non-integers were found.", ): - sc.pp.highly_variable_genes(pbmc, n_top_genes=1000, flavor='seurat_v3') + sc.pp.highly_variable_genes(pbmc, flavor="seurat_v3") def test_filter_genes_dispersion_compare_to_seurat(): - seurat_hvg_info = pd.read_csv(FILE, sep=' ') + seurat_hvg_info = pd.read_csv(FILE, sep=" ") pbmc = pbmc68k_reduced() pbmc.X = pbmc.raw.X @@ -401,7 +407,7 @@ def test_filter_genes_dispersion_compare_to_seurat(): sc.pp.normalize_per_cell(pbmc, counts_per_cell_after=1e4) sc.pp.filter_genes_dispersion( pbmc, - flavor='seurat', + flavor="seurat", log=True, subset=False, min_mean=0.0125, @@ -410,26 +416,26 @@ def test_filter_genes_dispersion_compare_to_seurat(): ) np.testing.assert_array_equal( - seurat_hvg_info['highly_variable'], pbmc.var['highly_variable'] + seurat_hvg_info["highly_variable"], pbmc.var["highly_variable"] ) # (still) Not equal to tolerance rtol=2e-05, atol=2e-05: # np.testing.assert_allclose(4, 3.9999, rtol=2e-05, atol=2e-05) np.testing.assert_allclose( - seurat_hvg_info['means'], - pbmc.var['means'], + seurat_hvg_info["means"], + pbmc.var["means"], rtol=2e-05, atol=2e-05, ) np.testing.assert_allclose( - seurat_hvg_info['dispersions'], - pbmc.var['dispersions'], + seurat_hvg_info["dispersions"], + pbmc.var["dispersions"], rtol=2e-05, atol=2e-05, ) np.testing.assert_allclose( - seurat_hvg_info['dispersions_norm'], - pbmc.var['dispersions_norm'], + seurat_hvg_info["dispersions_norm"], + pbmc.var["dispersions_norm"], rtol=2e-05, atol=2e-05, ) @@ -439,43 +445,43 @@ def test_highly_variable_genes_batches(): adata = pbmc68k_reduced() adata[:100, :100].X = np.zeros((100, 100)) - adata.obs['batch'] = ['0' if i < 100 else '1' for i in range(adata.n_obs)] - adata_1 = adata[adata.obs.batch.isin(['0']), :] - adata_2 = adata[adata.obs.batch.isin(['1']), :] + adata.obs["batch"] = ["0" if i < 100 else "1" for i in range(adata.n_obs)] + adata_1 = adata[adata.obs.batch.isin(["0"]), :] + adata_2 = adata[adata.obs.batch.isin(["1"]), :] sc.pp.highly_variable_genes( adata, - batch_key='batch', - flavor='cell_ranger', + batch_key="batch", + flavor="cell_ranger", n_top_genes=200, ) sc.pp.filter_genes(adata_1, min_cells=1) sc.pp.filter_genes(adata_2, min_cells=1) hvg1 = sc.pp.highly_variable_genes( - adata_1, flavor='cell_ranger', n_top_genes=200, inplace=False + adata_1, flavor="cell_ranger", n_top_genes=200, inplace=False ) hvg2 = sc.pp.highly_variable_genes( - adata_2, flavor='cell_ranger', n_top_genes=200, inplace=False + adata_2, flavor="cell_ranger", n_top_genes=200, inplace=False ) assert np.isclose( - adata.var['dispersions_norm'][100], - 0.5 * hvg1['dispersions_norm'][0] + 0.5 * hvg2['dispersions_norm'][100], + adata.var["dispersions_norm"][100], + 0.5 * hvg1["dispersions_norm"][0] + 0.5 * hvg2["dispersions_norm"][100], ) assert np.isclose( - adata.var['dispersions_norm'][101], - 0.5 * hvg1['dispersions_norm'][1] + 0.5 * hvg2['dispersions_norm'][101], + adata.var["dispersions_norm"][101], + 0.5 * hvg1["dispersions_norm"][1] + 0.5 * hvg2["dispersions_norm"][101], ) assert np.isclose( - adata.var['dispersions_norm'][0], 0.5 * hvg2['dispersions_norm'][0] + adata.var["dispersions_norm"][0], 0.5 * hvg2["dispersions_norm"][0] ) colnames = [ - 'means', - 'dispersions', - 'dispersions_norm', - 'highly_variable', + "means", + "dispersions", + "dispersions_norm", + "highly_variable", ] assert np.all(np.isin(colnames, hvg1.columns)) @@ -495,10 +501,10 @@ def test_seurat_v3_mean_var_output_with_batchkey(): true_var = np.var(pbmc.X.toarray(), axis=0, dtype=np.float64, ddof=1) result_df = sc.pp.highly_variable_genes( - pbmc, batch_key='batch', flavor='seurat_v3', n_top_genes=4000, inplace=False + pbmc, batch_key="batch", flavor="seurat_v3", n_top_genes=4000, inplace=False ) - np.testing.assert_allclose(true_mean, result_df['means'], rtol=2e-05, atol=2e-05) - np.testing.assert_allclose(true_var, result_df['variances'], rtol=2e-05, atol=2e-05) + np.testing.assert_allclose(true_mean, result_df["means"], rtol=2e-05, atol=2e-05) + np.testing.assert_allclose(true_var, result_df["variances"], rtol=2e-05, atol=2e-05) def test_cellranger_n_top_genes_warning():