diff --git a/diffxpy/api/__init__.py b/diffxpy/api/__init__.py index daaeb6a..a1c1dad 100644 --- a/diffxpy/api/__init__.py +++ b/diffxpy/api/__init__.py @@ -5,3 +5,4 @@ from . import enrich from . import stats from . import utils +from .. import pkg_constants diff --git a/diffxpy/api/stats.py b/diffxpy/api/stats.py index 8745f4a..4faaab0 100644 --- a/diffxpy/api/stats.py +++ b/diffxpy/api/stats.py @@ -3,7 +3,7 @@ from diffxpy.stats.stats import wald_test from diffxpy.stats.stats import wald_test_chisq from diffxpy.stats.stats import two_coef_z_test -from diffxpy.stats.stats import wilcoxon_test +from diffxpy.stats.stats import mann_whitney_u_test from diffxpy.stats.stats import t_test_moments from diffxpy.stats.stats import t_test_raw diff --git a/diffxpy/api/test.py b/diffxpy/api/test.py index ba8c01f..a338070 100644 --- a/diffxpy/api/test.py +++ b/diffxpy/api/test.py @@ -2,7 +2,7 @@ from diffxpy.testing.base import lrt from diffxpy.testing.base import wald from diffxpy.testing.base import t_test -from diffxpy.testing.base import wilcoxon +from diffxpy.testing.base import rank_test from diffxpy.testing.base import partition from diffxpy.testing.base import pairwise from diffxpy.testing.base import versus_rest diff --git a/diffxpy/pkg_constants.py b/diffxpy/pkg_constants.py index 191d62d..8d78c99 100644 --- a/diffxpy/pkg_constants.py +++ b/diffxpy/pkg_constants.py @@ -1,7 +1,10 @@ -BATCHGLM_OPTIM_GD = True -BATCHGLM_OPTIM_ADAM = True +BATCHGLM_OPTIM_GD = False +BATCHGLM_OPTIM_ADAM = False BATCHGLM_OPTIM_ADAGRAD = False BATCHGLM_OPTIM_RMSPROP = False -BATCHGLM_OPTIM_NEWTON = True -BATCHGLM_OPTIM_IRLS = True +BATCHGLM_OPTIM_NEWTON = False +BATCHGLM_OPTIM_NEWTON_TR = True +BATCHGLM_OPTIM_IRLS = False +BATCHGLM_OPTIM_IRLS_TR = False +BATCHGLM_PROVIDE_BATCHED = False BATCHGLM_TERMINATION_TYPE = "by_feature" \ No newline at end of file diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py index 6c1b6d2..d39a24f 100644 --- a/diffxpy/stats/stats.py +++ b/diffxpy/stats/stats.py @@ -1,7 +1,9 @@ +from typing import Union + import numpy as np import numpy.linalg import scipy.stats -from typing import Union +import xarray as xr def likelihood_ratio_test( @@ -37,7 +39,7 @@ def likelihood_ratio_test( return pvals -def wilcoxon_test( +def mann_whitney_u_test( x0: np.ndarray, x1: np.ndarray, ): @@ -68,7 +70,8 @@ def wilcoxon_test( scipy.stats.mannwhitneyu( x=x0[:, i].flatten(), y=x1[:, i].flatten(), - alternative='two-sided' + use_continuity=True, + alternative="two-sided" ).pvalue for i in range(x0.shape[1]) ]) return pvals @@ -152,7 +155,7 @@ def t_test_moments( out=s_delta ) - t_statistic = np.abs((mu0 - mu1) / s_delta) + t_statistic = (mu0 - mu1) / s_delta divisor = ( (np.square(var0 / n0) / (n0 - 1)) + @@ -174,7 +177,7 @@ def t_test_moments( out=df ) - pval = 2 * (1 - scipy.stats.t(df).cdf(t_statistic)) + pval = 2 * scipy.stats.t.sf(np.abs(t_statistic), df) return pval @@ -261,6 +264,9 @@ def wald_test_chisq( raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries') theta_diff = theta_mle - theta0 + # Convert to nd.array to avoid gufunc error. 
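The patched t_test_moments now keeps the sign of the statistic and computes the two-sided p-value via scipy.stats.t.sf, which avoids the precision loss of 2 * (1 - cdf) for very small p-values. A minimal standalone sketch of the same Welch-style computation, checked against scipy.stats.ttest_ind(..., equal_var=False); the variable names below are illustrative and not the diffxpy internals:

import numpy as np
import scipy.stats

# Standalone sketch of a two-sided Welch test in the form used above
# (signed statistic, survival function).
rng = np.random.RandomState(0)
x0 = rng.normal(0.0, 1.0, size=(50, 3))
x1 = rng.normal(0.0, 2.0, size=(40, 3))

mu0, mu1 = x0.mean(axis=0), x1.mean(axis=0)
var0, var1 = x0.var(axis=0, ddof=1), x1.var(axis=0, ddof=1)
n0, n1 = x0.shape[0], x1.shape[0]

s_delta = np.sqrt(var0 / n0 + var1 / n1)
t_statistic = (mu0 - mu1) / s_delta                   # signed, no np.abs here
df = np.square(var0 / n0 + var1 / n1) / (
    np.square(var0 / n0) / (n0 - 1) + np.square(var1 / n1) / (n1 - 1)
)
pval = 2 * scipy.stats.t.sf(np.abs(t_statistic), df)  # sf() keeps precision for small p-values

# Welch's t-test from scipy on the raw observations as a reference.
assert np.allclose(pval, scipy.stats.ttest_ind(x0, x1, axis=0, equal_var=False).pvalue)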
+ if isinstance(theta_diff, xr.DataArray): + theta_diff = theta_diff.values wald_statistic = np.array([ np.matmul( np.matmul( diff --git a/diffxpy/testing/base.py b/diffxpy/testing/base.py index 76bd900..723c208 100644 --- a/diffxpy/testing/base.py +++ b/diffxpy/testing/base.py @@ -5,6 +5,7 @@ import warnings import numpy as np +import scipy.sparse import xarray as xr import patsy try: @@ -13,6 +14,7 @@ anndata = None import batchglm.data as data_utils +from batchglm.xarray_sparse import SparseXArrayDataArray, SparseXArrayDataSet from batchglm.models.glm_nb import Model as GeneralizedLinearModel from ..stats import stats @@ -167,7 +169,10 @@ def _correction(self, method) -> np.ndarray: :param method: Multiple testing correction method. Browse available methods in the annotation of statsmodels.stats.multitest.multipletests(). """ - return correction.correct(pvals=self.pval, method=method) + if np.all(np.isnan(self.pval)): + return self.pval + else: + return correction.correct(pvals=self.pval, method=method) def _ave(self): """ @@ -186,7 +191,9 @@ def log_likelihood(self): @property def mean(self): if self._mean is None: - self._mean = self._ave().compute() + self._mean = self._ave() + if isinstance(self._mean, xr.DataArray): # Could also be np.ndarray coming out of XArraySparseDataArray + self._mean = self._mean.compute() return self._mean @property @@ -269,8 +276,12 @@ def plot_volcano( alpha=0.05, min_fc=1, size=20, - show=True, - save=None + highlight_ids: List = [], + highlight_size: float = 30, + highlight_col: str = "red", + show: bool = True, + save: Union[str, None] = None, + suffix: str = "_volcano.png" ): """ Returns a volcano plot of p-value vs. log fold change @@ -285,10 +296,13 @@ def plot_volcano( :param min_fc: Fold-change lower bound for visualization, the points below the threshold are colored in grey. :param size: Size of points. + :param highlight_ids: Genes to highlight in volcano plot. + :param highlight_ids: Size of points of genes to highlight in volcano plot. + :param highlight_ids: Color of points of genes to highlight in volcano plot. + :param show: Whether (if save is not None) and where (save indicates dir and file stem) to display plot. :param save: Path+file name stem to save plots to. - File will be save+"_volcano.png". Does not save if save is None. - :param show: Whether to display plot. - + File will be save+suffix. Does not save if save is None. + :param suffix: Suffix for file name to save plot to. Also use this to set the file type. 
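For reference, a hypothetical call of plot_volcano using the new highlighting and suffix arguments; det stands for any fitted differential expression test result, and the gene identifiers and output path are placeholders:

ax = det.plot_volcano(
    corrected_pval=True,
    alpha=0.05,
    min_fc=1,
    highlight_ids=["GeneA", "GeneB"],  # hypothetical IDs, must occur in det.gene_ids
    highlight_size=30,
    highlight_col="red",
    show=False,
    save="out/condition_test",         # figure is written to save + suffix
    suffix="_volcano.pdf"              # suffix also selects the file type
)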
:return: Tuple of matplotlib (figure, axis) """ @@ -322,6 +336,27 @@ def plot_volcano( legend=False, s=size, palette={True: "orange", False: "black"}) + highlight_ids_found = np.array([x in self.gene_ids for x in highlight_ids]) + highlight_ids_clean = [highlight_ids[i] for i in np.where(highlight_ids_found == True)[0]] + highlight_ids_not_found = [highlight_ids[i] for i in np.where(highlight_ids_found == False)[0]] + if len(highlight_ids_not_found) > 0: + logger.warning("not all highlight_ids were found in data set: ", ", ".join(highlight_ids_not_found)) + + if len(highlight_ids_clean) > 0: + neg_log_pvals_highlights = np.zeros([len(highlight_ids_clean)]) + logfc_highlights = np.zeros([len(highlight_ids_clean)]) + is_highlight = np.zeros([len(highlight_ids_clean)]) + for i,id in enumerate(highlight_ids_clean): + idx = np.where(self.gene_ids == id)[0] + neg_log_pvals_highlights[i] = neg_log_pvals[idx] + logfc_highlights[i] = logfc[idx] + + sns.scatterplot(y=neg_log_pvals_highlights, x=logfc_highlights, + hue=is_highlight, ax=ax, + legend=False, s=highlight_size, + palette={0: highlight_col}) + + if corrected_pval == True: ax.set(xlabel="log2FC", ylabel='-log10(corrected p-value)') else: @@ -329,7 +364,102 @@ def plot_volcano( # Save, show and return figure. if save is not None: - plt.savefig(save + '_volcano.png') + plt.savefig(save + suffix) + + if show: + plt.show() + + plt.close(fig) + + return ax + + def plot_ma( + self, + corrected_pval=True, + log2_fc_threshold=10, + alpha=0.05, + size=20, + highlight_ids: List = [], + highlight_size: float = 30, + highlight_col: str = "red", + show: bool = True, + save: Union[str, None] = None, + suffix: str = "_my_plot.png" + ): + """ + Returns an MA plot of mean expression vs. log fold change with significance + super-imposed. + + :param corrected_pval: Whether to use multiple testing corrected + or raw p-values. + :param log2_fc_threshold: Negative lower and upper bound of + log2 fold change displayed in plot. + :param alpha: p/q-value lower bound at which a test is considered + non-significant. The corresponding points are colored in grey. + :param size: Size of points. + :param highlight_ids: Genes to highlight in volcano plot. + :param highlight_ids: Size of points of genes to highlight in volcano plot. + :param highlight_ids: Color of points of genes to highlight in volcano plot. + :param show: Whether (if save is not None) and where (save indicates dir and file stem) to display plot. + :param save: Path+file name stem to save plots to. + File will be save+suffix. Does not save if save is None. + :param suffix: Suffix for file name to save plot to. Also use this to set the file type. 
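Analogously, a hypothetical call of the new MA plot defined here (placeholder gene IDs and output path again; det is any fitted test result):

ax = det.plot_ma(
    corrected_pval=True,
    log2_fc_threshold=10,
    alpha=0.05,
    highlight_ids=["GeneA"],   # hypothetical ID, must occur in det.gene_ids
    highlight_col="red",
    show=False,
    save="out/condition_test",
    suffix="_ma.png"           # overrides the default file name suffix
)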
+ + + :return: Tuple of matplotlib (figure, axis) + """ + import seaborn as sns + import matplotlib.pyplot as plt + from matplotlib import gridspec + from matplotlib import rcParams + + plt.ioff() + + ave = np.log(self.mean + 1e-08) + + logfc = np.reshape(self.log2_fold_change(), -1) + # Clipping throws errors if not performed in actual data format (ndarray or DataArray): + if isinstance(logfc, xr.DataArray): + logfc = logfc.clip(-log2_fc_threshold, log2_fc_threshold) + else: + logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) + + fig, ax = plt.subplots() + + if corrected_pval: + is_significant = self.pval < alpha + else: + is_significant = self.qval < alpha + + sns.scatterplot(y=logfc, x=ave, hue=is_significant, ax=ax, + legend=False, s=size, + palette={True: "orange", False: "black"}) + + highlight_ids_found = np.array([x in self.gene_ids for x in highlight_ids]) + highlight_ids_clean = [highlight_ids[i] for i in np.where(highlight_ids_found == True)[0]] + highlight_ids_not_found = [highlight_ids[i] for i in np.where(highlight_ids_found == False)[0]] + if len(highlight_ids_not_found) > 0: + logger.warning("not all highlight_ids were found in data set: ", ", ".join(highlight_ids_not_found)) + + if len(highlight_ids_clean) > 0: + ave_highlights = np.zeros([len(highlight_ids_clean)]) + logfc_highlights = np.zeros([len(highlight_ids_clean)]) + is_highlight = np.zeros([len(highlight_ids_clean)]) + for i,id in enumerate(highlight_ids_clean): + idx = np.where(self.gene_ids == id)[0] + ave_highlights[i] = ave[idx] + logfc_highlights[i] = logfc[idx] + + sns.scatterplot(y=logfc_highlights, x=ave_highlights, + hue=is_highlight, ax=ax, + legend=False, s=highlight_size, + palette={0: highlight_col}) + + ax.set(xlabel="log2FC", ylabel='log mean expression') + + # Save, show and return figure. 
+ if save is not None: + plt.savefig(save + suffix) if show: plt.show() @@ -386,9 +516,9 @@ class DifferentialExpressionTestLRT(_DifferentialExpressionTestSingle): """ sample_description: pd.DataFrame - full_design_info: patsy.design_info + full_design_loc_info: patsy.design_info full_estim: _Estimation - reduced_design_info: patsy.design_info + reduced_design_loc_info: patsy.design_info reduced_estim: _Estimation def __init__( @@ -401,9 +531,9 @@ def __init__( ): super().__init__() self.sample_description = sample_description - self.full_design_info = full_design_loc_info + self.full_design_loc_info = full_design_loc_info self.full_estim = full_estim - self.reduced_design_info = reduced_design_loc_info + self.reduced_design_loc_info = reduced_design_loc_info self.reduced_estim = reduced_estim @property @@ -457,7 +587,7 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): if not isinstance(factors, set): factors = set(factors) - di = self.full_design_info + di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.subset(factors).factor_infos]] dmat = self.full_estim.design_loc @@ -478,7 +608,7 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): # make the design matrix + sample description unique again dmat, sample_description = _dmat_unique(dmat, sample_description) - locations = self.full_estim.inverse_link_loc(dmat @ self.full_estim.par_link_loc) + locations = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc)) locations = np.log(locations) / np.log(base) dist = np.expand_dims(locations, axis=0) @@ -515,7 +645,7 @@ def log_fold_change(self, base=np.e, return_type="vector"): :return: either pandas.DataFrame or xarray.DataArray """ - factors = set(self.full_design_info.term_names).difference(self.reduced_design_info.term_names) + factors = set(self.full_design_loc_info.term_names) - set(self.reduced_design_loc_info.term_names) if return_type == "dataframe": dists = self._log_fold_change(factors=factors, base=base) @@ -540,13 +670,13 @@ def locations(self): :return: pd.DataFrame """ - di = self.full_design_info + di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.factor_infos]] dmat = self.full_estim.design_loc dmat, sample_description = _dmat_unique(dmat, sample_description) - retval = self.full_estim.inverse_link_loc(dmat @ self.full_estim.par_link_loc) + retval = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc)) retval = pd.DataFrame(retval, columns=self.full_estim.features) for col in sample_description: retval[col] = sample_description[col] @@ -562,13 +692,13 @@ def scales(self): :return: pd.DataFrame """ - di = self.full_design_info + di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.factor_infos]] dmat = self.full_estim.design_scale dmat, sample_description = _dmat_unique(dmat, sample_description) - retval = self.full_estim.inverse_link_scale(dmat @ self.full_estim.par_link_scale) + retval = self.full_estim.inverse_link_scale(dmat.doc(self.full_estim.par_link_scale)) retval = pd.DataFrame(retval, columns=self.full_estim.features) for col in sample_description: retval[col] = sample_description[col] @@ -662,11 +792,11 @@ def log_fold_change(self, base=np.e, **kwargs): # loc = dmat @ self.model_estim.par_link_loc[self.coef_loc_totest] # return loc[1] - loc[0] if len(self.coef_loc_totest) == 1: - return 
self.model_estim.par_link_loc[self.coef_loc_totest][0] + return self.model_estim.a_var[self.coef_loc_totest][0] else: - idx_max = np.argmax(np.abs(self.model_estim.par_link_loc[self.coef_loc_totest]), axis=0) - return self.model_estim.par_link_loc[self.coef_loc_totest][ - idx_max, np.arange(self.model_estim.par_link_loc.shape[1])] + idx_max = np.argmax(np.abs(self.model_estim.a_var[self.coef_loc_totest]), axis=0) + return self.model_estim.a_var[self.coef_loc_totest][ + idx_max, np.arange(self.model_estim.a_var.shape[1])] def _ll(self): """ @@ -682,7 +812,7 @@ def _ave(self): :return: xr.DataArray """ - return np.mean(self.X, axis=0) + return self.X.mean(axis=0) def _test(self): """ @@ -693,7 +823,7 @@ def _test(self): # Check whether single- or multiple parameters are tested. # For a single parameter, the wald statistic distribution is approximated # with a normal distribution, for multiple parameters, a chi-square distribution is used. - self.theta_mle = self.model_estim.par_link_loc[self.coef_loc_totest] + self.theta_mle = self.model_estim.a_var[self.coef_loc_totest] if len(self.coef_loc_totest) == 1: self.theta_mle = self.theta_mle[0] # Make xarray one dimensional for stats.wald_test. self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]].values @@ -780,43 +910,52 @@ class DifferentialExpressionTestTT(_DifferentialExpressionTestSingle): Single t-test test per gene. """ - def __init__(self, data, grouping, gene_ids): + def __init__(self, data, grouping, gene_names): super().__init__() self._X = data self.grouping = grouping - self._gene_ids = np.asarray(gene_ids) + self._gene_names = np.asarray(gene_names) x0, x1 = _split_X(data, grouping) # Only compute p-values for genes with non-zero observations and non-zero group-wise variance. - self._mean = np.mean(data, axis=0) + mean_x0 = x0.mean(axis=0) + mean_x1 = x1.mean(axis=0) + mean_x0 = mean_x0.clip(np.nextafter(0, 1), np.inf) + mean_x1 = mean_x1.clip(np.nextafter(0, 1), np.inf) + # TODO: do not need mean again + self._mean = data.mean(axis=0) self._ave_geq_zero = np.asarray(self.mean).flatten() > 0 + var_x0 = np.asarray(x0.var(axis=0)).flatten() + var_x1 = np.asarray(x1.var(axis=0)).flatten() self._var_geq_zero = np.logical_or( - np.asarray(np.var(x0, axis=0)).flatten() > 0, - np.asarray(np.var(x1, axis=0)).flatten() > 0 + var_x0 > 0, + var_x1 > 0 + ) + idx_run = np.where(np.logical_and(self._ave_geq_zero == True, self._var_geq_zero == True))[0] + pval = np.zeros([data.shape[1]]) + np.nan + pval[idx_run] = stats.t_test_moments( + mu0=mean_x0[idx_run], + mu1=mean_x1[idx_run], + var0=var_x0[idx_run], + var1=var_x1[idx_run], + n0=idx_run.shape[0], + n1=idx_run.shape[0] ) - idx_tt = np.where(np.logical_and(self._ave_geq_zero == True, self._var_geq_zero == True))[0] - pval = np.zeros([self._gene_ids.shape[0]]) + np.nan - pval[idx_tt] = stats.t_test_raw(x0=x0[:, idx_tt], x1=x1[:, idx_tt]) self._pval = pval - mean_x0 = np.mean(x0, axis=0) - mean_x0 = mean_x0.clip(np.nextafter(0, 1), np.inf) - mean_x1 = np.mean(x1, axis=0) - mean_x1 = mean_x1.clip(np.nextafter(0, 1), np.inf) - self._logfc = np.log(mean_x1) - np.log(mean_x0).data # Return 0 if LFC was non-zero and variances are zero, # this causes division by zero in the test statistic. This # is a highly significant result if one believes the variance estimate. 
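A toy illustration of the degenerate case handled by the assignment below (not part of the test class): if both groups have zero variance but different means, the Welch denominator is zero, the statistic diverges, and a p-value of 0 is the consistent limit.

import numpy as np

x0 = np.array([5.0, 5.0, 5.0])   # zero variance in both groups,
x1 = np.array([7.0, 7.0, 7.0])   # but a non-zero fold change
s_delta = np.sqrt(x0.var() / x0.size + x1.var() / x1.size)
print(s_delta)                   # 0.0 -> (mu0 - mu1) / s_delta is undefined, so pval is set to 0 instead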
- pval[np.logical_and(np.logical_and(self._var_geq_zero == False, - self._ave_geq_zero == True), - self._logfc != 0)] = 0 + pval[np.where(np.logical_and(np.logical_and(self._var_geq_zero == False, + self._ave_geq_zero == True), + self._logfc != 0))] = 0 q = self.qval @property def gene_ids(self) -> np.ndarray: - return self._gene_ids + return self._gene_names @property def X(self): @@ -852,9 +991,9 @@ def summary(self, qval_thres=None, fc_upper_thres=None, return res -class DifferentialExpressionTestWilcoxon(_DifferentialExpressionTestSingle): +class DifferentialExpressionTestRank(_DifferentialExpressionTestSingle): """ - Single wilcoxon rank sum test per gene. + Single rank test per gene (Mann-Whitney U test). """ def __init__(self, data, grouping, gene_names): @@ -865,9 +1004,36 @@ def __init__(self, data, grouping, gene_names): x0, x1 = _split_X(data, grouping) - self._mean = np.mean(data, axis=0) - self._pval = stats.wilcoxon_test(x0=x0.data, x1=x1.data) - self._logfc = np.log(np.mean(x1, axis=0)) - np.log(np.mean(x0, axis=0)).data + mean_x0 = x0.mean(axis=0) + mean_x1 = x1.mean(axis=0) + mean_x0 = mean_x0.clip(np.nextafter(0, 1), np.inf) + mean_x1 = mean_x1.clip(np.nextafter(0, 1), np.inf) + # TODO unnecessary mean computation + self._mean = data.mean(axis=0) + var_x0 = np.asarray(x0.var(axis=0)).flatten() + var_x1 = np.asarray(x1.var(axis=0)).flatten() + self._var_geq_zero = np.logical_or( + var_x0 > 0, + var_x1 > 0 + ) + idx_run = np.where(np.logical_and(self._mean > 0, self._var_geq_zero == True))[0] + + # TODO: can this be done on sparse? + pval = np.zeros([data.shape[1]]) + np.nan + if isinstance(x0, xr.DataArray): + pval[idx_run] = stats.mann_whitney_u_test( + x0=x0.data[:,idx_run], + x1=x1.data[:,idx_run] + ) + else: + pval[idx_run] = stats.mann_whitney_u_test( + x0=np.asarray(x0.X[:,idx_run].todense()), + x1=np.asarray(x1.X[:,idx_run].todense()) + ) + + self._pval = pval + + self._logfc = np.log(mean_x1) - np.log(mean_x0).data q = self.qval @property @@ -921,7 +1087,7 @@ def plot_vs_ttest(self): sns.scatterplot(x=ttest_pvals, y=self.pval, ax=ax) - ax.set(xlabel="t-test", ylabel='wilcoxon test') + ax.set(xlabel="t-test", ylabel='rank test') return fig, ax @@ -2029,13 +2195,13 @@ def _continuous_model(self, idx, nonnumeric=False): """ idx = np.asarray(idx) if nonnumeric: - mu = np.matmul(self._model_estim.design_loc, + mu = np.matmul(self._model_estim.design_loc.values, self._model_estim.par_link_loc[:,idx]) if self._size_factors is not None: mu = mu + self._size_factors else: idx_basis = self._spline_par_loc_idx(intercept=True) - mu = np.matmul(self._model_estim.design_loc[:,idx_basis], + mu = np.matmul(self._model_estim.design_loc[:,idx_basis].values, self._model_estim.par_link_loc[idx_basis, idx]) mu = np.exp(mu) @@ -2322,7 +2488,7 @@ def __init__(self, def _parse_gene_names(data, gene_names): if gene_names is None: - if anndata is not None and isinstance(data, anndata.AnnData): + if anndata is not None and (isinstance(data, anndata.AnnData) or isinstance(data, anndata.base.Raw)): gene_names = data.var_names elif isinstance(data, xr.DataArray): gene_names = data["features"] @@ -2358,8 +2524,14 @@ def _parse_sample_description(data, sample_description=None) -> pd.DataFrame: "Please specify `sample_description` or provide `data` as xarray.Dataset or anndata.AnnData " + "with corresponding sample annotations" ) - assert data.shape[0] == sample_description.shape[ - 0], "data matrix and sample description must contain same number of cells" + + if anndata is not None and 
isinstance(data, anndata.base.Raw): + # anndata.base.Raw does not have attribute shape. + assert data.X.shape[0] == sample_description.shape[0], \ + "data matrix and sample description must contain same number of cells" + else: + assert data.shape[0] == sample_description.shape[0], \ + "data matrix and sample description must contain same number of cells" return sample_description @@ -2439,7 +2611,6 @@ def _fit( init_model=None, init_a: Union[np.ndarray, str] = "AUTO", init_b: Union[np.ndarray, str] = "AUTO", - as_numeric: Union[np.ndarray, list, Tuple] = [], gene_names=None, size_factors=None, batch_size: int = None, @@ -2535,9 +2706,10 @@ def _fit( "adagrad": pkg_constants.BATCHGLM_OPTIM_ADAGRAD, "rmsprop": pkg_constants.BATCHGLM_OPTIM_RMSPROP, "nr": pkg_constants.BATCHGLM_OPTIM_NEWTON, - "irls": pkg_constants.BATCHGLM_OPTIM_IRLS + "nr_tr": pkg_constants.BATCHGLM_OPTIM_NEWTON_TR, + "irls": pkg_constants.BATCHGLM_OPTIM_IRLS, + "irls_tr": pkg_constants.BATCHGLM_OPTIM_IRLS_TR } - termination_type = pkg_constants.BATCHGLM_TERMINATION_TYPE if isinstance(training_strategy, str) and training_strategy.lower() == 'bfgs': lib_size = np.zeros(data.shape[0]) @@ -2578,7 +2750,8 @@ def _fit( init_a=init_a, init_b=init_b, provide_optimizers=provide_optimizers, - termination_type=termination_type, + provide_batched=pkg_constants.BATCHGLM_PROVIDE_BATCHED, + termination_type=pkg_constants.BATCHGLM_TERMINATION_TYPE, dtype=dtype, **constructor_args ) @@ -2605,9 +2778,9 @@ def _fit( def lrt( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], - reduced_formula_loc: str = None, - full_formula_loc: str = None, + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + reduced_formula_loc: str, + full_formula_loc: str, reduced_formula_scale: str = "~1", full_formula_scale: str = "~1", as_numeric: Union[List[str], Tuple[str], str] = (), @@ -2618,7 +2791,7 @@ def lrt( noise_model="nb", size_factors: np.ndarray = None, batch_size: int = None, - training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + training_strategy: Union[str, List[Dict[str, object]], Callable] = "DEFAULT", quick_scale: bool = False, dtype="float64", **kwargs @@ -2703,6 +2876,7 @@ def lrt( Should be "float32" for single precision or "float64" for double precision. :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. 
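A minimal example call in the style of the unit tests further below, assuming a cell-by-gene matrix X and a matching sample_description data frame with a "condition" column (both placeholders):

import diffxpy.api as de

test = de.test.lrt(
    data=X,                              # np.ndarray, scipy.sparse.csr_matrix, AnnData, ...
    full_formula_loc="~ 1 + condition",  # both location formulas are now required (no None default)
    reduced_formula_loc="~ 1",
    sample_description=sample_description,
    training_strategy="DEFAULT",
    dtype="float64"
)
summary = test.summary()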
""" + # TODO test nestedness if len(kwargs) != 0: logger.info("additional kwargs: %s", str(kwargs)) @@ -2744,7 +2918,6 @@ def lrt( constraints_scale=None, init_a=init_a, init_b=init_b, - as_numeric=as_numeric, gene_names=gene_names, size_factors=size_factors, batch_size=batch_size, @@ -2763,7 +2936,6 @@ def lrt( gene_names=gene_names, init_a="init_model", init_b="init_model", - as_numeric=as_numeric, init_model=reduced_model, size_factors=size_factors, batch_size=batch_size, @@ -2785,7 +2957,7 @@ def lrt( def wald( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], factor_loc_totest: Union[str, List[str]] = None, coef_to_test: Union[str, List[str]] = None, formula_loc: str = None, @@ -3005,7 +3177,6 @@ def wald( constraints_scale=constraints_scale, init_a=init_a, init_b=init_b, - as_numeric=as_numeric, gene_names=gene_names, size_factors=size_factors, batch_size=batch_size, @@ -3033,13 +3204,13 @@ def _parse_grouping(data, sample_description, grouping): def _split_X(data, grouping): groups = np.unique(grouping) - x0 = data[grouping == groups[0], :] - x1 = data[grouping == groups[1], :] + x0 = data[np.where(grouping == groups[0])[0]] + x1 = data[np.where(grouping == groups[1])[0]] return x0, x1 def t_test( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping, gene_names=None, sample_description=None, @@ -3059,27 +3230,29 @@ def t_test( :param sample_description: optional pandas.DataFrame containing sample annotations """ gene_names = _parse_gene_names(data, gene_names) - X: xr.DataArray = _parse_data(data, gene_names) + X = _parse_data(data, gene_names) + if isinstance(X, SparseXArrayDataSet): + X = X.X grouping = _parse_grouping(data, sample_description, grouping) de_test = DifferentialExpressionTestTT( data=X.astype(dtype), grouping=grouping, - gene_ids=gene_names, + gene_names=gene_names, ) return de_test -def wilcoxon( - data, +def rank_test( + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping, gene_names=None, sample_description=None, dtype="float32" ): """ - Perform Wilcoxon rank sum test for differential expression + Perform Mann-Whitney rank test (Wilcoxon rank-sum test) for differential expression between two groups on adata object for each gene. :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. 
@@ -3092,10 +3265,12 @@ def wilcoxon( :param sample_description: optional pandas.DataFrame containing sample annotations """ gene_names = _parse_gene_names(data, gene_names) - X: xr.DataArray = _parse_data(data, gene_names) + X = _parse_data(data, gene_names) + if isinstance(X, SparseXArrayDataSet): + X = X.X grouping = _parse_grouping(data, sample_description, grouping) - de_test = DifferentialExpressionTestWilcoxon( + de_test = DifferentialExpressionTestRank( data=X.astype(dtype), grouping=grouping, gene_names=gene_names, @@ -3105,7 +3280,7 @@ def wilcoxon( def two_sample( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test=None, @@ -3275,7 +3450,7 @@ def two_sample( dtype=dtype ) elif test.lower() == 'wilcoxon': - de_test = wilcoxon( + de_test = rank_test( data=X, gene_names=gene_names, grouping=grouping, @@ -3288,7 +3463,7 @@ def two_sample( def pairwise( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = [], test: str = 'z-test', @@ -3506,7 +3681,7 @@ def pairwise( def versus_rest( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test: str = 'wald', @@ -3671,7 +3846,7 @@ def versus_rest( def partition( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], partition: Union[str, np.ndarray, list], gene_names: str = None, sample_description: pd.DataFrame = None): @@ -3846,7 +4021,7 @@ def wilcoxon( """ DETestsSingle = [] for i, idx in enumerate(self.partition_idx): - DETestsSingle.append(wilcoxon( + DETestsSingle.append(rank_test( data=self.X[idx, :], grouping=grouping, gene_names=self.gene_names, @@ -3927,7 +4102,6 @@ def lrt( full_formula_loc=full_formula_loc, reduced_formula_scale=reduced_formula_scale, full_formula_scale=full_formula_scale, - as_numeric=as_numeric, gene_names=self.gene_names, sample_description=self.sample_description.iloc[idx, :], noise_model=noise_model, @@ -4027,7 +4201,7 @@ def wald( def continuous_1d( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], continuous: str, df: int = 5, factor_loc_totest: Union[str, List[str]] = None, @@ -4190,11 +4364,6 @@ def continuous_1d( # Note that the brackets around formula_term_continuous propagate the sum # across interaction terms. 
formula_term_continuous = '(' + formula_extension + ')' - if formula is not None: - formula_new = formula.split(continuous) - formula_new = formula_term_continuous.join(formula_new) - else: - formula_new = None if formula_loc is not None: formula_loc_new = formula_loc.split(continuous) diff --git a/diffxpy/unit_test/test_data_types.py b/diffxpy/unit_test/test_data_types.py index 3699c04..8381c3d 100644 --- a/diffxpy/unit_test/test_data_types.py +++ b/diffxpy/unit_test/test_data_types.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -import scipy.stats as stats import scipy.sparse import anndata @@ -11,22 +10,53 @@ import diffxpy.api as de -class TestDataTypes(unittest.TestCase): +class TestDataTypesSingle(unittest.TestCase): - def test_sparse_anndata(self, n_cells: int = 2000, n_genes: int = 100): - """ - Test if de.wald() generates a uniform p-value distribution - if it is given data simulated based on the null model. Returns the p-value - of the two-side Kolmgorov-Smirnov test for equality of the observed - p-value distribution and a uniform distribution. + def _test_wald(self, data, sample_description, gene_names=None): + test = de.test.wald( + data=data, + factor_loc_totest="condition", + formula_loc="~ 1 + condition", + sample_description=sample_description, + gene_names=gene_names, + quick_scale=True, + training_strategy="DEFAULT", + dtype="float64" + ) + summary = test.summary() - :param n_cells: Number of cells to simulate (number of observations per test). - :param n_genes: Number of genes to simulate (number of tests). - """ - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("diffxpy").setLevel(logging.WARNING) + def _test_lrt(self, data, sample_description, gene_names=None): + test = de.test.lrt( + data=data, + full_formula_loc="~ 1 + condition", + reduced_formula_loc="~ 1", + sample_description=sample_description, + gene_names=gene_names, + quick_scale=True, + training_strategy="DEFAULT", + dtype="float64" + ) + summary = test.summary() + def _test_t_test(self, data, sample_description, gene_names=None): + test = de.test.t_test( + data=data, + grouping="condition", + sample_description=sample_description, + gene_names=gene_names + ) + summary = test.summary() + + def _test_rank(self, data, sample_description, gene_names=None): + test = de.test.rank_test( + data=data, + grouping="condition", + sample_description=sample_description, + gene_names=gene_names + ) + summary = test.summary() + + def simulate(self, n_cells: int = 20, n_genes: int = 2): sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() @@ -34,28 +64,87 @@ def test_sparse_anndata(self, n_cells: int = 2000, n_genes: int = 100): random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.num_observations) }) + return sim.X ,random_sample_description - adata = anndata.AnnData(scipy.sparse.csr_matrix(sim.X.values)) - # X = adata.X - test = de.test.wald( - data=adata, - factor_loc_totest="condition", - formula="~ 1 + condition", - sample_description=random_sample_description, - quick_scale=True, - training_strategy="DEFAULT", - dtype="float64" - ) - summary = test.summary() + def _test_numpy(self, sparse): + data, sample_description = self.simulate() + gene_names = data.features + data = data.values + if sparse: + data = scipy.sparse.csr_matrix(data) + + self._test_wald(data=data, 
sample_description=sample_description, gene_names=gene_names) + #self._test_lrt(data=data, sample_description=sample_description, gene_names=gene_names) + self._test_t_test(data=data, sample_description=sample_description, gene_names=gene_names) + self._test_rank(data=data, sample_description=sample_description, gene_names=gene_names) + + def _test_xarray(self): + data, sample_description = self.simulate() + + self._test_wald(data=data, sample_description=sample_description) + #self._test_lrt(data=data, sample_description=sample_description) + self._test_t_test(data=data, sample_description=sample_description) + self._test_rank(data=data, sample_description=sample_description) + + def _test_anndata(self, sparse): + data, sample_description = self.simulate() + gene_names = [str(x) for x in data.features.values] + data = data.values + if sparse: + data = scipy.sparse.csr_matrix(data) + + data = anndata.AnnData(data) + data.var_names = gene_names + self._test_wald(data=data, sample_description=sample_description) + #self._test_lrt(data=data, sample_description=sample_description) + self._test_t_test(data=data, sample_description=sample_description) + self._test_rank(data=data, sample_description=sample_description) - # Compare p-value distribution under null model against uniform distribution. - pval_h0 = stats.kstest(test.pval, 'uniform').pvalue + def _test_anndata_raw(self, sparse): + data, sample_description = self.simulate() + gene_names = [str(x) for x in data.features.values] + data = data.values + if sparse: + data = scipy.sparse.csr_matrix(data) - logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0) - assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!" + data = anndata.AnnData(data) + data.var_names = gene_names + data.raw = data + self._test_wald(data=data.raw, sample_description=sample_description) + #self._test_lrt(data=data.raw, sample_description=sample_description) + self._test_t_test(data=data, sample_description=sample_description) + self._test_rank(data=data, sample_description=sample_description) + + def test_numpy(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + self._test_numpy(sparse=False) + self._test_numpy(sparse=True) return True + def test_xarray(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + self._test_xarray() + + return True + + def test_anndata(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + self._test_anndata(sparse=False) + self._test_anndata(sparse=True) + self._test_anndata_raw(sparse=False) + self._test_anndata_raw(sparse=True) + + return True if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_single.py b/diffxpy/unit_test/test_single.py index cb3ca7a..048ff5b 100644 --- a/diffxpy/unit_test/test_single.py +++ b/diffxpy/unit_test/test_single.py @@ -36,7 +36,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100): test = de.test.wald( data=sim.X, factor_loc_totest="condition", - formula="~ 1 + condition + batch", + formula_loc="~ 1 + condition + batch", sample_description=random_sample_description, batch_size=500, training_strategy="DEFAULT", @@ -77,7 
+77,7 @@ def test_null_distribution_wald_multi(self, n_cells: int = 2000, n_genes: int = test = de.test.wald( data=sim.X, factor_loc_totest="condition", - formula="~ 1 + condition", + formula_loc="~ 1 + condition", sample_description=random_sample_description, training_strategy="DEFAULT", dtype="float64" @@ -194,7 +194,7 @@ def test_null_distribution_wilcoxon(self, n_cells: int = 2000, n_genes: int = 10 "condition": np.random.randint(2, size=sim.num_observations) }) - test = de.test.wilcoxon( + test = de.test.rank_test( data=sim.X, grouping="condition", sample_description=random_sample_description, @@ -253,9 +253,59 @@ def _eval(self, sim, test): return sim + def test_wilcoxon_de(self, n_cells: int = 2000, n_genes: int = 100): + """ + Test if de.test.t_test() generates a uniform p-value distribution + if it is given data simulated based on the null model. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + + test = de.test.rank_test( + data=sim.X, + grouping="condition", + sample_description=sim.sample_description, + dtype="float64" + ) + + self._eval(sim=sim, test=test) + + return True + + def test_t_test_de(self, n_cells: int = 2000, n_genes: int = 100): + """ + Test if de.test.t_test() generates a uniform p-value distribution + if it is given data simulated based on the null model. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + + test = de.test.t_test( + data=sim.X, + grouping="condition", + sample_description=sim.sample_description, + dtype="float64" + ) + + self._eval(sim=sim, test=test) + + return True + def test_wald_de(self, n_cells: int = 2000, n_genes: int = 100): """ - Test if de.lrt() generates a uniform p-value distribution + Test if de.test.wald() generates a uniform p-value distribution if it is given data simulated based on the null model. :param n_cells: Number of cells to simulate (number of observations per test). @@ -270,7 +320,7 @@ def test_wald_de(self, n_cells: int = 2000, n_genes: int = 100): test = de.test.wald( data=sim.X, factor_loc_totest="condition", - formula="~ 1 + condition", + formula_loc="~ 1 + condition", sample_description=sim.sample_description, training_strategy="DEFAULT", dtype="float64" @@ -282,7 +332,7 @@ def test_wald_de(self, n_cells: int = 2000, n_genes: int = 100): def test_lrt_de(self, n_cells: int = 2000, n_genes: int = 100): """ - Test if de.lrt() generates a uniform p-value distribution + Test if de.test.lrt() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. 
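The null-distribution tests in this file all reduce to the same check, sketched here with test as a placeholder for any of the fitted test objects; this mirrors the assertion removed from test_data_types.py, and _eval presumably performs an equivalent comparison:

import scipy.stats as stats

# p-values simulated under the null model should be approximately uniform on [0, 1].
pval_h0 = stats.kstest(test.pval, "uniform").pvalue
assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"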
@@ -312,5 +362,110 @@ def test_lrt_de(self, n_cells: int = 2000, n_genes: int = 100): return True +class TestSingleExternal(unittest.TestCase): + + def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100): + """ + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=2) + sim.generate_params( + rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1, + rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape)) + ) + sim.generate_data() + + return sim + + def _eval(self, test, ref_pvals): + test_pval = test.pval + pval_dev = np.abs(test_pval - ref_pvals) + log_pval_dev = np.abs(np.log(test_pval+1e-200) - np.log(ref_pvals+1e-200)) + max_dev = np.max(pval_dev) + max_log_dev = np.max(log_pval_dev) + mean_dev = np.mean(log_pval_dev) + logging.getLogger("diffxpy").info( + 'maximum absolute p-value deviation: %f' % + float(max_dev) + ) + logging.getLogger("diffxpy").info( + 'maximum absolute log p-value deviation: %f' % + float(max_log_dev) + ) + logging.getLogger("diffxpy").info( + 'mean absolute log p-value deviation: %f' % + float(mean_dev) + ) + assert max_dev < 1e-3, "maximum deviation too large" + assert max_log_dev < 1e-1, "maximum deviation in log space too large" + + def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100): + """ + Test if de.test.t_test() generates the same p-value distribution as scipy t-test. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.INFO) + + sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + + test = de.test.t_test( + data=sim.X, + grouping="condition", + sample_description=sim.sample_description, + dtype="float64" + ) + + # Run scipy t-tests as a reference. + conds = np.unique(sim.sample_description["condition"].values) + ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] + ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] + scipy_pvals = stats.ttest_ind(a=sim.X[ind_a, :], b=sim.X[ind_b, :], axis=0, equal_var=False).pvalue + + self._eval(test=test, ref_pvals=scipy_pvals) + + return True + + def test_wilcoxon_ref(self, n_cells: int = 2000, n_genes: int = 100): + """ + Test if de.test.t_test() generates the same p-value distribution as scipy t-test. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.INFO) + + sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + + test = de.test.rank_test( + data=sim.X, + grouping="condition", + sample_description=sim.sample_description, + dtype="float64" + ) + + # Run scipy t-tests as a reference. 
+ conds = np.unique(sim.sample_description["condition"].values) + ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] + ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] + scipy_pvals = np.array([ + stats.mannwhitneyu(x=sim.X[ind_a, i], y=sim.X[ind_b, i], + use_continuity=True, alternative="two-sided").pvalue + for i in range(sim.X.shape[1]) + ]) + + self._eval(test=test, ref_pvals=scipy_pvals) + + return True + + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_stats.py b/diffxpy/unit_test/test_stats.py index f88779a..d517609 100644 --- a/diffxpy/unit_test/test_stats.py +++ b/diffxpy/unit_test/test_stats.py @@ -128,7 +128,7 @@ def test_wilcoxon(self, n: int = 1000, n_test: int = 100): x1 = np.vstack([np.random.normal(loc=locs[i], scale=scales[i], size=n_test) for i in range(n)]).T # Compute p-value distribution under null model. - pvals = de.stats.wilcoxon_test(x0=x0, x1=x1) + pvals = de.stats.mann_whitney_u_test(x0=x0, x1=x1) # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(pvals, 'uniform').pvalue
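For completeness, a sketch of what the renamed helper computes per column, checked against a direct scipy loop on simulated data (standalone; the equivalence follows from the wrapper shown in stats.py above):

import numpy as np
import scipy.stats
import diffxpy.api as de

rng = np.random.RandomState(0)
x0 = rng.normal(size=(100, 10))
x1 = rng.normal(size=(100, 10))

pvals = de.stats.mann_whitney_u_test(x0=x0, x1=x1)
ref = np.array([
    scipy.stats.mannwhitneyu(
        x=x0[:, i], y=x1[:, i],
        use_continuity=True, alternative="two-sided"
    ).pvalue
    for i in range(x0.shape[1])
])
assert np.allclose(pvals, ref)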