diff --git a/docs/release-notes/1.9.7.md b/docs/release-notes/1.9.7.md index 2df82d9407..e8d626f6c4 100644 --- a/docs/release-notes/1.9.7.md +++ b/docs/release-notes/1.9.7.md @@ -5,3 +5,4 @@ - Fix handling of numpy array palettes (e.g. after write-read cycle) {pr}`2734` {smaller}`P Angerer` - Specify correct version of `matplotlib` dependency {pr}`2733` {smaller}`P Fisher` - Fix {func}`scanpy.pl.violin` usage of `seaborn.catplot` {pr}`2739` {smaller}`E Roellin` +- Fix {func}`scanpy.pp.highly_variable_genes` to handle the combinations of `inplace` and `subset` consistently {pr}`2757` {smaller}`E Roellin` diff --git a/scanpy/datasets/_datasets.py b/scanpy/datasets/_datasets.py index 20e20fc988..a3a8b40019 100644 --- a/scanpy/datasets/_datasets.py +++ b/scanpy/datasets/_datasets.py @@ -10,6 +10,7 @@ from .._settings import settings from ..readwrite import read, read_visium from ._utils import check_datasetdir_exists, filter_oldformatwarning +from .._utils import AnyRandom HERE = Path(__file__).parent @@ -19,6 +20,7 @@ def blobs( n_centers: int = 5, cluster_std: float = 1.0, n_observations: int = 640, + random_state: AnyRandom = 0, ) -> ad.AnnData: """\ Gaussian Blobs. @@ -34,6 +36,8 @@ def blobs( n_observations Number of observations. By default, this is the same observation number as in :func:`scanpy.datasets.krumsiek11`. + random_state + Determines random number generation for dataset creation. Returns ------- @@ -47,7 +51,7 @@ def blobs( n_features=n_variables, centers=n_centers, cluster_std=cluster_std, - random_state=0, + random_state=random_state, ) return ad.AnnData(X, obs=dict(blobs=y.astype(str))) diff --git a/scanpy/preprocessing/_highly_variable_genes.py b/scanpy/preprocessing/_highly_variable_genes.py index 45e83732ec..212d35a596 100644 --- a/scanpy/preprocessing/_highly_variable_genes.py +++ b/scanpy/preprocessing/_highly_variable_genes.py @@ -1,17 +1,17 @@ import warnings -from typing import Optional, Literal +from typing import Literal, Optional + import numpy as np import pandas as pd import scipy.sparse as sp_sparse from anndata import AnnData - from .. import logging as logg -from .._settings import settings, Verbosity -from .._utils import sanitize_anndata, check_nonnegative_integers -from ._utils import _get_mean_var +from .._settings import Verbosity, settings +from .._utils import check_nonnegative_integers, sanitize_anndata from ._distributed import materialize_as_ndarray from ._simple import filter_genes +from ._utils import _get_mean_var def _highly_variable_genes_seurat_v3( @@ -52,7 +52,7 @@ def _highly_variable_genes_seurat_v3( from skmisc.loess import loess except ImportError: raise ImportError( - 'Please install skmisc package via `pip install --user scikit-misc' + "Please install skmisc package via `pip install --user scikit-misc" ) df = pd.DataFrame(index=adata.var_names) X = adata.layers[layer] if layer is not None else adata.X @@ -63,7 +63,7 @@ def _highly_variable_genes_seurat_v3( UserWarning, ) - df['means'], df['variances'] = _get_mean_var(X) + df["means"], df["variances"] = _get_mean_var(X) if batch_key is None: batch_info = pd.Categorical(np.zeros(adata.shape[0], dtype=int)) @@ -128,48 +128,51 @@ def _highly_variable_genes_seurat_v3( ma_ranked = np.ma.masked_invalid(ranked_norm_gene_vars) median_ranked = np.ma.median(ma_ranked, axis=0).filled(np.nan) - df['highly_variable_nbatches'] = num_batches_high_var - df['highly_variable_rank'] = median_ranked - df['variances_norm'] = np.mean(norm_gene_vars, axis=0) + df["highly_variable_nbatches"] = num_batches_high_var + df["highly_variable_rank"] = median_ranked + df["variances_norm"] = np.mean(norm_gene_vars, axis=0) sorted_index = ( - df[['highly_variable_rank', 'highly_variable_nbatches']] + df[["highly_variable_rank", "highly_variable_nbatches"]] .sort_values( - ['highly_variable_rank', 'highly_variable_nbatches'], + ["highly_variable_rank", "highly_variable_nbatches"], ascending=[True, False], - na_position='last', + na_position="last", ) .index ) - df['highly_variable'] = False - df.loc[sorted_index[: int(n_top_genes)], 'highly_variable'] = True + df["highly_variable"] = False + df.loc[sorted_index[: int(n_top_genes)], "highly_variable"] = True - if inplace or subset: - adata.uns['hvg'] = {'flavor': 'seurat_v3'} + if inplace: + adata.uns["hvg"] = {"flavor": "seurat_v3"} logg.hint( - 'added\n' - ' \'highly_variable\', boolean vector (adata.var)\n' - ' \'highly_variable_rank\', float vector (adata.var)\n' - ' \'means\', float vector (adata.var)\n' - ' \'variances\', float vector (adata.var)\n' - ' \'variances_norm\', float vector (adata.var)' + "added\n" + " 'highly_variable', boolean vector (adata.var)\n" + " 'highly_variable_rank', float vector (adata.var)\n" + " 'means', float vector (adata.var)\n" + " 'variances', float vector (adata.var)\n" + " 'variances_norm', float vector (adata.var)" ) - adata.var['highly_variable'] = df['highly_variable'].values - adata.var['highly_variable_rank'] = df['highly_variable_rank'].values - adata.var['means'] = df['means'].values - adata.var['variances'] = df['variances'].values - adata.var['variances_norm'] = df['variances_norm'].values.astype( - 'float64', copy=False + adata.var["highly_variable"] = df["highly_variable"].values + adata.var["highly_variable_rank"] = df["highly_variable_rank"].values + adata.var["means"] = df["means"].values + adata.var["variances"] = df["variances"].values + adata.var["variances_norm"] = df["variances_norm"].values.astype( + "float64", copy=False ) if batch_key is not None: - adata.var['highly_variable_nbatches'] = df[ - 'highly_variable_nbatches' + adata.var["highly_variable_nbatches"] = df[ + "highly_variable_nbatches" ].values if subset: - adata._inplace_subset_var(df['highly_variable'].values) + adata._inplace_subset_var(df["highly_variable"].values) else: if batch_key is None: - df = df.drop(['highly_variable_nbatches'], axis=1) + df = df.drop(["highly_variable_nbatches"], axis=1) + if subset: + df = df.iloc[df.highly_variable.values, :] + return df @@ -182,7 +185,7 @@ def _highly_variable_genes_single_batch( max_mean: Optional[float] = 3, n_top_genes: Optional[int] = None, n_bins: int = 20, - flavor: Literal['seurat', 'cell_ranger'] = 'seurat', + flavor: Literal["seurat", "cell_ranger"] = "seurat", ) -> pd.DataFrame: """\ See `highly_variable_genes`. @@ -193,10 +196,10 @@ def _highly_variable_genes_single_batch( `highly_variable`, `means`, `dispersions`, and `dispersions_norm`. """ X = adata.layers[layer] if layer is not None else adata.X - if flavor == 'seurat': + if flavor == "seurat": X = X.copy() - if 'log1p' in adata.uns_keys() and adata.uns['log1p'].get('base') is not None: - X *= np.log(adata.uns['log1p']['base']) + if "log1p" in adata.uns_keys() and adata.uns["log1p"].get("base") is not None: + X *= np.log(adata.uns["log1p"]["base"]) # use out if possible. only possible since we copy X if isinstance(X, np.ndarray): np.expm1(X, out=X) @@ -207,29 +210,29 @@ def _highly_variable_genes_single_batch( # now actually compute the dispersion mean[mean == 0] = 1e-12 # set entries equal to zero to small value dispersion = var / mean - if flavor == 'seurat': # logarithmized mean as in Seurat + if flavor == "seurat": # logarithmized mean as in Seurat dispersion[dispersion == 0] = np.nan dispersion = np.log(dispersion) mean = np.log1p(mean) # all of the following quantities are "per-gene" here df = pd.DataFrame() - df['means'] = mean - df['dispersions'] = dispersion - if flavor == 'seurat': - df['mean_bin'] = pd.cut(df['means'], bins=n_bins) - disp_grouped = df.groupby('mean_bin')['dispersions'] + df["means"] = mean + df["dispersions"] = dispersion + if flavor == "seurat": + df["mean_bin"] = pd.cut(df["means"], bins=n_bins) + disp_grouped = df.groupby("mean_bin")["dispersions"] disp_mean_bin = disp_grouped.mean() disp_std_bin = disp_grouped.std(ddof=1) # retrieve those genes that have nan std, these are the ones where # only a single gene fell in the bin and implicitly set them to have # a normalized disperion of 1 one_gene_per_bin = disp_std_bin.isnull() - gen_indices = np.where(one_gene_per_bin[df['mean_bin'].values])[0].tolist() + gen_indices = np.where(one_gene_per_bin[df["mean_bin"].values])[0].tolist() if len(gen_indices) > 0: logg.debug( - f'Gene indices {gen_indices} fell into a single bin: their ' - 'normalized dispersion was set to 1.\n ' - 'Decreasing `n_bins` will likely avoid this effect.' + f"Gene indices {gen_indices} fell into a single bin: their " + "normalized dispersion was set to 1.\n " + "Decreasing `n_bins` will likely avoid this effect." ) # Circumvent pandas 0.23 bug. Both sides of the assignment have dtype==float32, # but there’s still a dtype error without “.value”. @@ -238,48 +241,48 @@ def _highly_variable_genes_single_batch( ].values disp_mean_bin[one_gene_per_bin.values] = 0 # actually do the normalization - df['dispersions_norm'] = ( - df['dispersions'].values # use values here as index differs - - disp_mean_bin[df['mean_bin'].values].values - ) / disp_std_bin[df['mean_bin'].values].values - elif flavor == 'cell_ranger': + df["dispersions_norm"] = ( + df["dispersions"].values # use values here as index differs + - disp_mean_bin[df["mean_bin"].values].values + ) / disp_std_bin[df["mean_bin"].values].values + elif flavor == "cell_ranger": from statsmodels import robust - df['mean_bin'] = pd.cut( - df['means'], - np.r_[-np.inf, np.percentile(df['means'], np.arange(10, 105, 5)), np.inf], + df["mean_bin"] = pd.cut( + df["means"], + np.r_[-np.inf, np.percentile(df["means"], np.arange(10, 105, 5)), np.inf], ) - disp_grouped = df.groupby('mean_bin')['dispersions'] + disp_grouped = df.groupby("mean_bin")["dispersions"] disp_median_bin = disp_grouped.median() # the next line raises the warning: "Mean of empty slice" with warnings.catch_warnings(): - warnings.simplefilter('ignore') + warnings.simplefilter("ignore") disp_mad_bin = disp_grouped.apply(robust.mad) - df['dispersions_norm'] = ( - df['dispersions'].values - disp_median_bin[df['mean_bin'].values].values - ) / disp_mad_bin[df['mean_bin'].values].values + df["dispersions_norm"] = ( + df["dispersions"].values - disp_median_bin[df["mean_bin"].values].values + ) / disp_mad_bin[df["mean_bin"].values].values else: raise ValueError('`flavor` needs to be "seurat" or "cell_ranger"') - dispersion_norm = df['dispersions_norm'].values + dispersion_norm = df["dispersions_norm"].values if n_top_genes is not None: dispersion_norm = dispersion_norm[~np.isnan(dispersion_norm)] dispersion_norm[ ::-1 ].sort() # interestingly, np.argpartition is slightly slower if n_top_genes > adata.n_vars: - logg.info('`n_top_genes` > `adata.n_var`, returning all genes.') + logg.info("`n_top_genes` > `adata.n_var`, returning all genes.") n_top_genes = adata.n_vars if n_top_genes > dispersion_norm.size: warnings.warn( - '`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.', + "`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.", UserWarning, ) n_top_genes = dispersion_norm.size disp_cut_off = dispersion_norm[n_top_genes - 1] - gene_subset = np.nan_to_num(df['dispersions_norm'].values) >= disp_cut_off + gene_subset = np.nan_to_num(df["dispersions_norm"].values) >= disp_cut_off logg.debug( - f'the {n_top_genes} top genes correspond to a ' - f'normalized dispersion cutoff of {disp_cut_off}' + f"the {n_top_genes} top genes correspond to a " + f"normalized dispersion cutoff of {disp_cut_off}" ) else: dispersion_norm[np.isnan(dispersion_norm)] = 0 # similar to Seurat @@ -292,7 +295,7 @@ def _highly_variable_genes_single_batch( ) ) - df['highly_variable'] = gene_subset + df["highly_variable"] = gene_subset return df @@ -306,7 +309,7 @@ def highly_variable_genes( max_mean: Optional[float] = 3, span: Optional[float] = 0.3, n_bins: int = 20, - flavor: Literal['seurat', 'cell_ranger', 'seurat_v3'] = 'seurat', + flavor: Literal["seurat", "cell_ranger", "seurat_v3"] = "seurat", subset: bool = False, inplace: bool = True, batch_key: Optional[str] = None, @@ -418,17 +421,17 @@ def highly_variable_genes( if n_top_genes is not None and not all( m is None for m in [min_disp, max_disp, min_mean, max_mean] ): - logg.info('If you pass `n_top_genes`, all cutoffs are ignored.') + logg.info("If you pass `n_top_genes`, all cutoffs are ignored.") - start = logg.info('extracting highly variable genes') + start = logg.info("extracting highly variable genes") if not isinstance(adata, AnnData): raise ValueError( - '`pp.highly_variable_genes` expects an `AnnData` argument, ' - 'pass `inplace=False` if you want to return a `pd.DataFrame`.' + "`pp.highly_variable_genes` expects an `AnnData` argument, " + "pass `inplace=False` if you want to return a `pd.DataFrame`." ) - if flavor == 'seurat_v3': + if flavor == "seurat_v3": return _highly_variable_genes_seurat_v3( adata, layer=layer, @@ -483,9 +486,9 @@ def highly_variable_genes( np.zeros((np.sum(~filt), len(hvg.columns))), columns=hvg.columns, ) - missing_hvg['highly_variable'] = missing_hvg['highly_variable'].astype(bool) - missing_hvg['gene'] = gene_list[~filt] - hvg['gene'] = adata_subset.var_names.values + missing_hvg["highly_variable"] = missing_hvg["highly_variable"].astype(bool) + missing_hvg["gene"] = gene_list[~filt] + hvg["gene"] = adata_subset.var_names.values hvg = pd.concat([hvg, missing_hvg], ignore_index=True) # Order as before filtering @@ -495,8 +498,8 @@ def highly_variable_genes( df.append(hvg) df = pd.concat(df, axis=0) - df['highly_variable'] = df['highly_variable'].astype(int) - df = df.groupby('gene').agg( + df["highly_variable"] = df["highly_variable"].astype(int) + df = df.groupby("gene").agg( dict( means=np.nanmean, dispersions=np.nanmean, @@ -505,9 +508,9 @@ def highly_variable_genes( ) ) df.rename( - columns=dict(highly_variable='highly_variable_nbatches'), inplace=True + columns=dict(highly_variable="highly_variable_nbatches"), inplace=True ) - df['highly_variable_intersection'] = df['highly_variable_nbatches'] == len( + df["highly_variable_intersection"] = df["highly_variable_nbatches"] == len( batches ) @@ -515,14 +518,14 @@ def highly_variable_genes( # sort genes by how often they selected as hvg within each batch and # break ties with normalized dispersion across batches df.sort_values( - ['highly_variable_nbatches', 'dispersions_norm'], + ["highly_variable_nbatches", "dispersions_norm"], ascending=False, - na_position='last', + na_position="last", inplace=True, ) high_var = np.zeros(df.shape[0]) high_var[:n_top_genes] = True - df['highly_variable'] = high_var.astype(bool) + df["highly_variable"] = high_var.astype(bool) df = df.loc[adata.var_names, :] else: df = df.loc[adata.var_names] @@ -536,33 +539,38 @@ def highly_variable_genes( df.dispersions_norm < max_disp, ) ) - df['highly_variable'] = gene_subset + df["highly_variable"] = gene_subset - logg.info(' finished', time=start) + logg.info(" finished", time=start) - if inplace or subset: - adata.uns['hvg'] = {'flavor': flavor} + if inplace: + adata.uns["hvg"] = {"flavor": flavor} logg.hint( - 'added\n' - ' \'highly_variable\', boolean vector (adata.var)\n' - ' \'means\', float vector (adata.var)\n' - ' \'dispersions\', float vector (adata.var)\n' - ' \'dispersions_norm\', float vector (adata.var)' + "added\n" + " 'highly_variable', boolean vector (adata.var)\n" + " 'means', float vector (adata.var)\n" + " 'dispersions', float vector (adata.var)\n" + " 'dispersions_norm', float vector (adata.var)" ) - adata.var['highly_variable'] = df['highly_variable'].values - adata.var['means'] = df['means'].values - adata.var['dispersions'] = df['dispersions'].values - adata.var['dispersions_norm'] = df['dispersions_norm'].values.astype( - 'float32', copy=False + adata.var["highly_variable"] = df["highly_variable"].values + adata.var["means"] = df["means"].values + adata.var["dispersions"] = df["dispersions"].values + adata.var["dispersions_norm"] = df["dispersions_norm"].values.astype( + "float32", copy=False ) + if batch_key is not None: - adata.var['highly_variable_nbatches'] = df[ - 'highly_variable_nbatches' + adata.var["highly_variable_nbatches"] = df[ + "highly_variable_nbatches" ].values - adata.var['highly_variable_intersection'] = df[ - 'highly_variable_intersection' + adata.var["highly_variable_intersection"] = df[ + "highly_variable_intersection" ].values if subset: - adata._inplace_subset_var(df['highly_variable'].values) + adata._inplace_subset_var(df["highly_variable"].values) + else: + if subset: + df = df.iloc[df.highly_variable.values, :] + return df diff --git a/scanpy/tests/test_highly_variable_genes.py b/scanpy/tests/test_highly_variable_genes.py index a46ef0eee1..722857285d 100644 --- a/scanpy/tests/test_highly_variable_genes.py +++ b/scanpy/tests/test_highly_variable_genes.py @@ -512,3 +512,38 @@ def test_cellranger_n_top_genes_warning(): match="`n_top_genes` > number of normalized dispersions, returning all genes with normalized dispersions.", ): sc.pp.highly_variable_genes(adata, n_top_genes=1000, flavor="cell_ranger") + + +@pytest.mark.parametrize("flavor", ["seurat", "cell_ranger"]) +@pytest.mark.parametrize("subset", [True, False]) +@pytest.mark.parametrize("inplace", [True, False]) +def test_highly_variable_genes_subset_inplace_consistency( + flavor, + subset, + inplace, +): + adata = sc.datasets.blobs(n_observations=20, n_variables=80, random_state=0) + adata.X = np.abs(adata.X).astype(int) + + if flavor == "seurat" or flavor == "cell_ranger": + sc.pp.normalize_total(adata, target_sum=1e4) + sc.pp.log1p(adata) + + elif flavor == "seurat_v3": + pass + + else: + raise ValueError(f"Unknown flavor {flavor}") + + n_genes = adata.shape[1] + + output_df = sc.pp.highly_variable_genes( + adata, + flavor=flavor, + n_top_genes=15, + subset=subset, + inplace=inplace, + ) + + assert (output_df is None) == inplace + assert len(adata.var if inplace else output_df) == (15 if subset else n_genes)