diff --git a/diffxpy/api/__init__.py b/diffxpy/api/__init__.py index daaeb6a..a1c1dad 100644 --- a/diffxpy/api/__init__.py +++ b/diffxpy/api/__init__.py @@ -5,3 +5,4 @@ from . import enrich from . import stats from . import utils +from .. import pkg_constants diff --git a/diffxpy/api/stats.py b/diffxpy/api/stats.py index 8745f4a..4faaab0 100644 --- a/diffxpy/api/stats.py +++ b/diffxpy/api/stats.py @@ -3,7 +3,7 @@ from diffxpy.stats.stats import wald_test from diffxpy.stats.stats import wald_test_chisq from diffxpy.stats.stats import two_coef_z_test -from diffxpy.stats.stats import wilcoxon_test +from diffxpy.stats.stats import mann_whitney_u_test from diffxpy.stats.stats import t_test_moments from diffxpy.stats.stats import t_test_raw diff --git a/diffxpy/api/test.py b/diffxpy/api/test.py index ba8c01f..a338070 100644 --- a/diffxpy/api/test.py +++ b/diffxpy/api/test.py @@ -2,7 +2,7 @@ from diffxpy.testing.base import lrt from diffxpy.testing.base import wald from diffxpy.testing.base import t_test -from diffxpy.testing.base import wilcoxon +from diffxpy.testing.base import rank_test from diffxpy.testing.base import partition from diffxpy.testing.base import pairwise from diffxpy.testing.base import versus_rest diff --git a/diffxpy/pkg_constants.py b/diffxpy/pkg_constants.py index 191d62d..8d78c99 100644 --- a/diffxpy/pkg_constants.py +++ b/diffxpy/pkg_constants.py @@ -1,7 +1,10 @@ -BATCHGLM_OPTIM_GD = True -BATCHGLM_OPTIM_ADAM = True +BATCHGLM_OPTIM_GD = False +BATCHGLM_OPTIM_ADAM = False BATCHGLM_OPTIM_ADAGRAD = False BATCHGLM_OPTIM_RMSPROP = False -BATCHGLM_OPTIM_NEWTON = True -BATCHGLM_OPTIM_IRLS = True +BATCHGLM_OPTIM_NEWTON = False +BATCHGLM_OPTIM_NEWTON_TR = True +BATCHGLM_OPTIM_IRLS = False +BATCHGLM_OPTIM_IRLS_TR = False +BATCHGLM_PROVIDE_BATCHED = False BATCHGLM_TERMINATION_TYPE = "by_feature" \ No newline at end of file diff --git a/diffxpy/stats/stats.py b/diffxpy/stats/stats.py index 6c1b6d2..d39a24f 100644 --- a/diffxpy/stats/stats.py +++ b/diffxpy/stats/stats.py @@ -1,7 +1,9 @@ +from typing import Union + import numpy as np import numpy.linalg import scipy.stats -from typing import Union +import xarray as xr def likelihood_ratio_test( @@ -37,7 +39,7 @@ def likelihood_ratio_test( return pvals -def wilcoxon_test( +def mann_whitney_u_test( x0: np.ndarray, x1: np.ndarray, ): @@ -68,7 +70,8 @@ def wilcoxon_test( scipy.stats.mannwhitneyu( x=x0[:, i].flatten(), y=x1[:, i].flatten(), - alternative='two-sided' + use_continuity=True, + alternative="two-sided" ).pvalue for i in range(x0.shape[1]) ]) return pvals @@ -152,7 +155,7 @@ def t_test_moments( out=s_delta ) - t_statistic = np.abs((mu0 - mu1) / s_delta) + t_statistic = (mu0 - mu1) / s_delta divisor = ( (np.square(var0 / n0) / (n0 - 1)) + @@ -174,7 +177,7 @@ def t_test_moments( out=df ) - pval = 2 * (1 - scipy.stats.t(df).cdf(t_statistic)) + pval = 2 * scipy.stats.t.sf(np.abs(t_statistic), df) return pval @@ -261,6 +264,9 @@ def wald_test_chisq( raise ValueError('stats.wald_test(): theta_mle and theta0 have to contain the same number of entries') theta_diff = theta_mle - theta0 + # Convert to nd.array to avoid gufunc error. 
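The patched t_test_moments now keeps the sign of the statistic and computes the two-sided p-value via scipy.stats.t.sf, which avoids the precision loss of 2 * (1 - cdf) for very small p-values. A minimal standalone sketch of the same Welch-style computation, checked against scipy.stats.ttest_ind(..., equal_var=False); the variable names below are illustrative and not the diffxpy internals:

import numpy as np
import scipy.stats

# Standalone sketch of a two-sided Welch test in the form used above
# (signed statistic, survival function).
rng = np.random.RandomState(0)
x0 = rng.normal(0.0, 1.0, size=(50, 3))
x1 = rng.normal(0.0, 2.0, size=(40, 3))

mu0, mu1 = x0.mean(axis=0), x1.mean(axis=0)
var0, var1 = x0.var(axis=0, ddof=1), x1.var(axis=0, ddof=1)
n0, n1 = x0.shape[0], x1.shape[0]

s_delta = np.sqrt(var0 / n0 + var1 / n1)
t_statistic = (mu0 - mu1) / s_delta                   # signed, no np.abs here
df = np.square(var0 / n0 + var1 / n1) / (
    np.square(var0 / n0) / (n0 - 1) + np.square(var1 / n1) / (n1 - 1)
)
pval = 2 * scipy.stats.t.sf(np.abs(t_statistic), df)  # sf() keeps precision for small p-values

# Welch's t-test from scipy on the raw observations as a reference.
assert np.allclose(pval, scipy.stats.ttest_ind(x0, x1, axis=0, equal_var=False).pvalue)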
+ if isinstance(theta_diff, xr.DataArray): + theta_diff = theta_diff.values wald_statistic = np.array([ np.matmul( np.matmul( diff --git a/diffxpy/testing/base.py b/diffxpy/testing/base.py index 76bd900..723c208 100644 --- a/diffxpy/testing/base.py +++ b/diffxpy/testing/base.py @@ -5,6 +5,7 @@ import warnings import numpy as np +import scipy.sparse import xarray as xr import patsy try: @@ -13,6 +14,7 @@ anndata = None import batchglm.data as data_utils +from batchglm.xarray_sparse import SparseXArrayDataArray, SparseXArrayDataSet from batchglm.models.glm_nb import Model as GeneralizedLinearModel from ..stats import stats @@ -167,7 +169,10 @@ def _correction(self, method) -> np.ndarray: :param method: Multiple testing correction method. Browse available methods in the annotation of statsmodels.stats.multitest.multipletests(). """ - return correction.correct(pvals=self.pval, method=method) + if np.all(np.isnan(self.pval)): + return self.pval + else: + return correction.correct(pvals=self.pval, method=method) def _ave(self): """ @@ -186,7 +191,9 @@ def log_likelihood(self): @property def mean(self): if self._mean is None: - self._mean = self._ave().compute() + self._mean = self._ave() + if isinstance(self._mean, xr.DataArray): # Could also be np.ndarray coming out of XArraySparseDataArray + self._mean = self._mean.compute() return self._mean @property @@ -269,8 +276,12 @@ def plot_volcano( alpha=0.05, min_fc=1, size=20, - show=True, - save=None + highlight_ids: List = [], + highlight_size: float = 30, + highlight_col: str = "red", + show: bool = True, + save: Union[str, None] = None, + suffix: str = "_volcano.png" ): """ Returns a volcano plot of p-value vs. log fold change @@ -285,10 +296,13 @@ def plot_volcano( :param min_fc: Fold-change lower bound for visualization, the points below the threshold are colored in grey. :param size: Size of points. + :param highlight_ids: Genes to highlight in volcano plot. + :param highlight_ids: Size of points of genes to highlight in volcano plot. + :param highlight_ids: Color of points of genes to highlight in volcano plot. + :param show: Whether (if save is not None) and where (save indicates dir and file stem) to display plot. :param save: Path+file name stem to save plots to. - File will be save+"_volcano.png". Does not save if save is None. - :param show: Whether to display plot. - + File will be save+suffix. Does not save if save is None. + :param suffix: Suffix for file name to save plot to. Also use this to set the file type. 
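For reference, a hypothetical call of plot_volcano using the new highlighting and suffix arguments; det stands for any fitted differential expression test result, and the gene identifiers and output path are placeholders:

ax = det.plot_volcano(
    corrected_pval=True,
    alpha=0.05,
    min_fc=1,
    highlight_ids=["GeneA", "GeneB"],  # hypothetical IDs, must occur in det.gene_ids
    highlight_size=30,
    highlight_col="red",
    show=False,
    save="out/condition_test",         # figure is written to save + suffix
    suffix="_volcano.pdf"              # suffix also selects the file type
)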
:return: Tuple of matplotlib (figure, axis) """ @@ -322,6 +336,27 @@ def plot_volcano( legend=False, s=size, palette={True: "orange", False: "black"}) + highlight_ids_found = np.array([x in self.gene_ids for x in highlight_ids]) + highlight_ids_clean = [highlight_ids[i] for i in np.where(highlight_ids_found == True)[0]] + highlight_ids_not_found = [highlight_ids[i] for i in np.where(highlight_ids_found == False)[0]] + if len(highlight_ids_not_found) > 0: + logger.warning("not all highlight_ids were found in data set: ", ", ".join(highlight_ids_not_found)) + + if len(highlight_ids_clean) > 0: + neg_log_pvals_highlights = np.zeros([len(highlight_ids_clean)]) + logfc_highlights = np.zeros([len(highlight_ids_clean)]) + is_highlight = np.zeros([len(highlight_ids_clean)]) + for i,id in enumerate(highlight_ids_clean): + idx = np.where(self.gene_ids == id)[0] + neg_log_pvals_highlights[i] = neg_log_pvals[idx] + logfc_highlights[i] = logfc[idx] + + sns.scatterplot(y=neg_log_pvals_highlights, x=logfc_highlights, + hue=is_highlight, ax=ax, + legend=False, s=highlight_size, + palette={0: highlight_col}) + + if corrected_pval == True: ax.set(xlabel="log2FC", ylabel='-log10(corrected p-value)') else: @@ -329,7 +364,102 @@ def plot_volcano( # Save, show and return figure. if save is not None: - plt.savefig(save + '_volcano.png') + plt.savefig(save + suffix) + + if show: + plt.show() + + plt.close(fig) + + return ax + + def plot_ma( + self, + corrected_pval=True, + log2_fc_threshold=10, + alpha=0.05, + size=20, + highlight_ids: List = [], + highlight_size: float = 30, + highlight_col: str = "red", + show: bool = True, + save: Union[str, None] = None, + suffix: str = "_my_plot.png" + ): + """ + Returns an MA plot of mean expression vs. log fold change with significance + super-imposed. + + :param corrected_pval: Whether to use multiple testing corrected + or raw p-values. + :param log2_fc_threshold: Negative lower and upper bound of + log2 fold change displayed in plot. + :param alpha: p/q-value lower bound at which a test is considered + non-significant. The corresponding points are colored in grey. + :param size: Size of points. + :param highlight_ids: Genes to highlight in volcano plot. + :param highlight_ids: Size of points of genes to highlight in volcano plot. + :param highlight_ids: Color of points of genes to highlight in volcano plot. + :param show: Whether (if save is not None) and where (save indicates dir and file stem) to display plot. + :param save: Path+file name stem to save plots to. + File will be save+suffix. Does not save if save is None. + :param suffix: Suffix for file name to save plot to. Also use this to set the file type. 
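Analogously, a hypothetical call of the new MA plot defined here (placeholder gene IDs and output path again; det is any fitted test result):

ax = det.plot_ma(
    corrected_pval=True,
    log2_fc_threshold=10,
    alpha=0.05,
    highlight_ids=["GeneA"],   # hypothetical ID, must occur in det.gene_ids
    highlight_col="red",
    show=False,
    save="out/condition_test",
    suffix="_ma.png"           # overrides the default file name suffix
)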
+ + + :return: Tuple of matplotlib (figure, axis) + """ + import seaborn as sns + import matplotlib.pyplot as plt + from matplotlib import gridspec + from matplotlib import rcParams + + plt.ioff() + + ave = np.log(self.mean + 1e-08) + + logfc = np.reshape(self.log2_fold_change(), -1) + # Clipping throws errors if not performed in actual data format (ndarray or DataArray): + if isinstance(logfc, xr.DataArray): + logfc = logfc.clip(-log2_fc_threshold, log2_fc_threshold) + else: + logfc = np.clip(logfc, -log2_fc_threshold, log2_fc_threshold, logfc) + + fig, ax = plt.subplots() + + if corrected_pval: + is_significant = self.pval < alpha + else: + is_significant = self.qval < alpha + + sns.scatterplot(y=logfc, x=ave, hue=is_significant, ax=ax, + legend=False, s=size, + palette={True: "orange", False: "black"}) + + highlight_ids_found = np.array([x in self.gene_ids for x in highlight_ids]) + highlight_ids_clean = [highlight_ids[i] for i in np.where(highlight_ids_found == True)[0]] + highlight_ids_not_found = [highlight_ids[i] for i in np.where(highlight_ids_found == False)[0]] + if len(highlight_ids_not_found) > 0: + logger.warning("not all highlight_ids were found in data set: ", ", ".join(highlight_ids_not_found)) + + if len(highlight_ids_clean) > 0: + ave_highlights = np.zeros([len(highlight_ids_clean)]) + logfc_highlights = np.zeros([len(highlight_ids_clean)]) + is_highlight = np.zeros([len(highlight_ids_clean)]) + for i,id in enumerate(highlight_ids_clean): + idx = np.where(self.gene_ids == id)[0] + ave_highlights[i] = ave[idx] + logfc_highlights[i] = logfc[idx] + + sns.scatterplot(y=logfc_highlights, x=ave_highlights, + hue=is_highlight, ax=ax, + legend=False, s=highlight_size, + palette={0: highlight_col}) + + ax.set(xlabel="log2FC", ylabel='log mean expression') + + # Save, show and return figure. 
+ if save is not None: + plt.savefig(save + suffix) if show: plt.show() @@ -386,9 +516,9 @@ class DifferentialExpressionTestLRT(_DifferentialExpressionTestSingle): """ sample_description: pd.DataFrame - full_design_info: patsy.design_info + full_design_loc_info: patsy.design_info full_estim: _Estimation - reduced_design_info: patsy.design_info + reduced_design_loc_info: patsy.design_info reduced_estim: _Estimation def __init__( @@ -401,9 +531,9 @@ def __init__( ): super().__init__() self.sample_description = sample_description - self.full_design_info = full_design_loc_info + self.full_design_loc_info = full_design_loc_info self.full_estim = full_estim - self.reduced_design_info = reduced_design_loc_info + self.reduced_design_loc_info = reduced_design_loc_info self.reduced_estim = reduced_estim @property @@ -457,7 +587,7 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): if not isinstance(factors, set): factors = set(factors) - di = self.full_design_info + di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.subset(factors).factor_infos]] dmat = self.full_estim.design_loc @@ -478,7 +608,7 @@ def _log_fold_change(self, factors: Union[Dict, Tuple, Set, List], base=np.e): # make the design matrix + sample description unique again dmat, sample_description = _dmat_unique(dmat, sample_description) - locations = self.full_estim.inverse_link_loc(dmat @ self.full_estim.par_link_loc) + locations = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc)) locations = np.log(locations) / np.log(base) dist = np.expand_dims(locations, axis=0) @@ -515,7 +645,7 @@ def log_fold_change(self, base=np.e, return_type="vector"): :return: either pandas.DataFrame or xarray.DataArray """ - factors = set(self.full_design_info.term_names).difference(self.reduced_design_info.term_names) + factors = set(self.full_design_loc_info.term_names) - set(self.reduced_design_loc_info.term_names) if return_type == "dataframe": dists = self._log_fold_change(factors=factors, base=base) @@ -540,13 +670,13 @@ def locations(self): :return: pd.DataFrame """ - di = self.full_design_info + di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.factor_infos]] dmat = self.full_estim.design_loc dmat, sample_description = _dmat_unique(dmat, sample_description) - retval = self.full_estim.inverse_link_loc(dmat @ self.full_estim.par_link_loc) + retval = self.full_estim.inverse_link_loc(dmat.dot(self.full_estim.par_link_loc)) retval = pd.DataFrame(retval, columns=self.full_estim.features) for col in sample_description: retval[col] = sample_description[col] @@ -562,13 +692,13 @@ def scales(self): :return: pd.DataFrame """ - di = self.full_design_info + di = self.full_design_loc_info sample_description = self.sample_description[[f.name() for f in di.factor_infos]] dmat = self.full_estim.design_scale dmat, sample_description = _dmat_unique(dmat, sample_description) - retval = self.full_estim.inverse_link_scale(dmat @ self.full_estim.par_link_scale) + retval = self.full_estim.inverse_link_scale(dmat.doc(self.full_estim.par_link_scale)) retval = pd.DataFrame(retval, columns=self.full_estim.features) for col in sample_description: retval[col] = sample_description[col] @@ -662,11 +792,11 @@ def log_fold_change(self, base=np.e, **kwargs): # loc = dmat @ self.model_estim.par_link_loc[self.coef_loc_totest] # return loc[1] - loc[0] if len(self.coef_loc_totest) == 1: - return 
self.model_estim.par_link_loc[self.coef_loc_totest][0] + return self.model_estim.a_var[self.coef_loc_totest][0] else: - idx_max = np.argmax(np.abs(self.model_estim.par_link_loc[self.coef_loc_totest]), axis=0) - return self.model_estim.par_link_loc[self.coef_loc_totest][ - idx_max, np.arange(self.model_estim.par_link_loc.shape[1])] + idx_max = np.argmax(np.abs(self.model_estim.a_var[self.coef_loc_totest]), axis=0) + return self.model_estim.a_var[self.coef_loc_totest][ + idx_max, np.arange(self.model_estim.a_var.shape[1])] def _ll(self): """ @@ -682,7 +812,7 @@ def _ave(self): :return: xr.DataArray """ - return np.mean(self.X, axis=0) + return self.X.mean(axis=0) def _test(self): """ @@ -693,7 +823,7 @@ def _test(self): # Check whether single- or multiple parameters are tested. # For a single parameter, the wald statistic distribution is approximated # with a normal distribution, for multiple parameters, a chi-square distribution is used. - self.theta_mle = self.model_estim.par_link_loc[self.coef_loc_totest] + self.theta_mle = self.model_estim.a_var[self.coef_loc_totest] if len(self.coef_loc_totest) == 1: self.theta_mle = self.theta_mle[0] # Make xarray one dimensional for stats.wald_test. self.theta_sd = self.model_estim.fisher_inv[:, self.coef_loc_totest[0], self.coef_loc_totest[0]].values @@ -780,43 +910,52 @@ class DifferentialExpressionTestTT(_DifferentialExpressionTestSingle): Single t-test test per gene. """ - def __init__(self, data, grouping, gene_ids): + def __init__(self, data, grouping, gene_names): super().__init__() self._X = data self.grouping = grouping - self._gene_ids = np.asarray(gene_ids) + self._gene_names = np.asarray(gene_names) x0, x1 = _split_X(data, grouping) # Only compute p-values for genes with non-zero observations and non-zero group-wise variance. - self._mean = np.mean(data, axis=0) + mean_x0 = x0.mean(axis=0) + mean_x1 = x1.mean(axis=0) + mean_x0 = mean_x0.clip(np.nextafter(0, 1), np.inf) + mean_x1 = mean_x1.clip(np.nextafter(0, 1), np.inf) + # TODO: do not need mean again + self._mean = data.mean(axis=0) self._ave_geq_zero = np.asarray(self.mean).flatten() > 0 + var_x0 = np.asarray(x0.var(axis=0)).flatten() + var_x1 = np.asarray(x1.var(axis=0)).flatten() self._var_geq_zero = np.logical_or( - np.asarray(np.var(x0, axis=0)).flatten() > 0, - np.asarray(np.var(x1, axis=0)).flatten() > 0 + var_x0 > 0, + var_x1 > 0 + ) + idx_run = np.where(np.logical_and(self._ave_geq_zero == True, self._var_geq_zero == True))[0] + pval = np.zeros([data.shape[1]]) + np.nan + pval[idx_run] = stats.t_test_moments( + mu0=mean_x0[idx_run], + mu1=mean_x1[idx_run], + var0=var_x0[idx_run], + var1=var_x1[idx_run], + n0=idx_run.shape[0], + n1=idx_run.shape[0] ) - idx_tt = np.where(np.logical_and(self._ave_geq_zero == True, self._var_geq_zero == True))[0] - pval = np.zeros([self._gene_ids.shape[0]]) + np.nan - pval[idx_tt] = stats.t_test_raw(x0=x0[:, idx_tt], x1=x1[:, idx_tt]) self._pval = pval - mean_x0 = np.mean(x0, axis=0) - mean_x0 = mean_x0.clip(np.nextafter(0, 1), np.inf) - mean_x1 = np.mean(x1, axis=0) - mean_x1 = mean_x1.clip(np.nextafter(0, 1), np.inf) - self._logfc = np.log(mean_x1) - np.log(mean_x0).data # Return 0 if LFC was non-zero and variances are zero, # this causes division by zero in the test statistic. This # is a highly significant result if one believes the variance estimate. 
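A toy illustration of the degenerate case handled by the assignment below (not part of the test class): if both groups have zero variance but different means, the Welch denominator is zero, the statistic diverges, and a p-value of 0 is the consistent limit.

import numpy as np

x0 = np.array([5.0, 5.0, 5.0])   # zero variance in both groups,
x1 = np.array([7.0, 7.0, 7.0])   # but a non-zero fold change
s_delta = np.sqrt(x0.var() / x0.size + x1.var() / x1.size)
print(s_delta)                   # 0.0 -> (mu0 - mu1) / s_delta is undefined, so pval is set to 0 instead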
- pval[np.logical_and(np.logical_and(self._var_geq_zero == False, - self._ave_geq_zero == True), - self._logfc != 0)] = 0 + pval[np.where(np.logical_and(np.logical_and(self._var_geq_zero == False, + self._ave_geq_zero == True), + self._logfc != 0))] = 0 q = self.qval @property def gene_ids(self) -> np.ndarray: - return self._gene_ids + return self._gene_names @property def X(self): @@ -852,9 +991,9 @@ def summary(self, qval_thres=None, fc_upper_thres=None, return res -class DifferentialExpressionTestWilcoxon(_DifferentialExpressionTestSingle): +class DifferentialExpressionTestRank(_DifferentialExpressionTestSingle): """ - Single wilcoxon rank sum test per gene. + Single rank test per gene (Mann-Whitney U test). """ def __init__(self, data, grouping, gene_names): @@ -865,9 +1004,36 @@ def __init__(self, data, grouping, gene_names): x0, x1 = _split_X(data, grouping) - self._mean = np.mean(data, axis=0) - self._pval = stats.wilcoxon_test(x0=x0.data, x1=x1.data) - self._logfc = np.log(np.mean(x1, axis=0)) - np.log(np.mean(x0, axis=0)).data + mean_x0 = x0.mean(axis=0) + mean_x1 = x1.mean(axis=0) + mean_x0 = mean_x0.clip(np.nextafter(0, 1), np.inf) + mean_x1 = mean_x1.clip(np.nextafter(0, 1), np.inf) + # TODO unnecessary mean computation + self._mean = data.mean(axis=0) + var_x0 = np.asarray(x0.var(axis=0)).flatten() + var_x1 = np.asarray(x1.var(axis=0)).flatten() + self._var_geq_zero = np.logical_or( + var_x0 > 0, + var_x1 > 0 + ) + idx_run = np.where(np.logical_and(self._mean > 0, self._var_geq_zero == True))[0] + + # TODO: can this be done on sparse? + pval = np.zeros([data.shape[1]]) + np.nan + if isinstance(x0, xr.DataArray): + pval[idx_run] = stats.mann_whitney_u_test( + x0=x0.data[:,idx_run], + x1=x1.data[:,idx_run] + ) + else: + pval[idx_run] = stats.mann_whitney_u_test( + x0=np.asarray(x0.X[:,idx_run].todense()), + x1=np.asarray(x1.X[:,idx_run].todense()) + ) + + self._pval = pval + + self._logfc = np.log(mean_x1) - np.log(mean_x0).data q = self.qval @property @@ -921,7 +1087,7 @@ def plot_vs_ttest(self): sns.scatterplot(x=ttest_pvals, y=self.pval, ax=ax) - ax.set(xlabel="t-test", ylabel='wilcoxon test') + ax.set(xlabel="t-test", ylabel='rank test') return fig, ax @@ -2029,13 +2195,13 @@ def _continuous_model(self, idx, nonnumeric=False): """ idx = np.asarray(idx) if nonnumeric: - mu = np.matmul(self._model_estim.design_loc, + mu = np.matmul(self._model_estim.design_loc.values, self._model_estim.par_link_loc[:,idx]) if self._size_factors is not None: mu = mu + self._size_factors else: idx_basis = self._spline_par_loc_idx(intercept=True) - mu = np.matmul(self._model_estim.design_loc[:,idx_basis], + mu = np.matmul(self._model_estim.design_loc[:,idx_basis].values, self._model_estim.par_link_loc[idx_basis, idx]) mu = np.exp(mu) @@ -2322,7 +2488,7 @@ def __init__(self, def _parse_gene_names(data, gene_names): if gene_names is None: - if anndata is not None and isinstance(data, anndata.AnnData): + if anndata is not None and (isinstance(data, anndata.AnnData) or isinstance(data, anndata.base.Raw)): gene_names = data.var_names elif isinstance(data, xr.DataArray): gene_names = data["features"] @@ -2358,8 +2524,14 @@ def _parse_sample_description(data, sample_description=None) -> pd.DataFrame: "Please specify `sample_description` or provide `data` as xarray.Dataset or anndata.AnnData " + "with corresponding sample annotations" ) - assert data.shape[0] == sample_description.shape[ - 0], "data matrix and sample description must contain same number of cells" + + if anndata is not None and 
isinstance(data, anndata.base.Raw): + # anndata.base.Raw does not have attribute shape. + assert data.X.shape[0] == sample_description.shape[0], \ + "data matrix and sample description must contain same number of cells" + else: + assert data.shape[0] == sample_description.shape[0], \ + "data matrix and sample description must contain same number of cells" return sample_description @@ -2439,7 +2611,6 @@ def _fit( init_model=None, init_a: Union[np.ndarray, str] = "AUTO", init_b: Union[np.ndarray, str] = "AUTO", - as_numeric: Union[np.ndarray, list, Tuple] = [], gene_names=None, size_factors=None, batch_size: int = None, @@ -2535,9 +2706,10 @@ def _fit( "adagrad": pkg_constants.BATCHGLM_OPTIM_ADAGRAD, "rmsprop": pkg_constants.BATCHGLM_OPTIM_RMSPROP, "nr": pkg_constants.BATCHGLM_OPTIM_NEWTON, - "irls": pkg_constants.BATCHGLM_OPTIM_IRLS + "nr_tr": pkg_constants.BATCHGLM_OPTIM_NEWTON_TR, + "irls": pkg_constants.BATCHGLM_OPTIM_IRLS, + "irls_tr": pkg_constants.BATCHGLM_OPTIM_IRLS_TR } - termination_type = pkg_constants.BATCHGLM_TERMINATION_TYPE if isinstance(training_strategy, str) and training_strategy.lower() == 'bfgs': lib_size = np.zeros(data.shape[0]) @@ -2578,7 +2750,8 @@ def _fit( init_a=init_a, init_b=init_b, provide_optimizers=provide_optimizers, - termination_type=termination_type, + provide_batched=pkg_constants.BATCHGLM_PROVIDE_BATCHED, + termination_type=pkg_constants.BATCHGLM_TERMINATION_TYPE, dtype=dtype, **constructor_args ) @@ -2605,9 +2778,9 @@ def _fit( def lrt( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], - reduced_formula_loc: str = None, - full_formula_loc: str = None, + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], + reduced_formula_loc: str, + full_formula_loc: str, reduced_formula_scale: str = "~1", full_formula_scale: str = "~1", as_numeric: Union[List[str], Tuple[str], str] = (), @@ -2618,7 +2791,7 @@ def lrt( noise_model="nb", size_factors: np.ndarray = None, batch_size: int = None, - training_strategy: Union[str, List[Dict[str, object]], Callable] = "AUTO", + training_strategy: Union[str, List[Dict[str, object]], Callable] = "DEFAULT", quick_scale: bool = False, dtype="float64", **kwargs @@ -2703,6 +2876,7 @@ def lrt( Should be "float32" for single precision or "float64" for double precision. :param kwargs: [Debugging] Additional arguments will be passed to the _fit method. 
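A minimal example call in the style of the unit tests further below, assuming a cell-by-gene matrix X and a matching sample_description data frame with a "condition" column (both placeholders):

import diffxpy.api as de

test = de.test.lrt(
    data=X,                              # np.ndarray, scipy.sparse.csr_matrix, AnnData, ...
    full_formula_loc="~ 1 + condition",  # both location formulas are now required (no None default)
    reduced_formula_loc="~ 1",
    sample_description=sample_description,
    training_strategy="DEFAULT",
    dtype="float64"
)
summary = test.summary()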
""" + # TODO test nestedness if len(kwargs) != 0: logger.info("additional kwargs: %s", str(kwargs)) @@ -2744,7 +2918,6 @@ def lrt( constraints_scale=None, init_a=init_a, init_b=init_b, - as_numeric=as_numeric, gene_names=gene_names, size_factors=size_factors, batch_size=batch_size, @@ -2763,7 +2936,6 @@ def lrt( gene_names=gene_names, init_a="init_model", init_b="init_model", - as_numeric=as_numeric, init_model=reduced_model, size_factors=size_factors, batch_size=batch_size, @@ -2785,7 +2957,7 @@ def lrt( def wald( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], factor_loc_totest: Union[str, List[str]] = None, coef_to_test: Union[str, List[str]] = None, formula_loc: str = None, @@ -3005,7 +3177,6 @@ def wald( constraints_scale=constraints_scale, init_a=init_a, init_b=init_b, - as_numeric=as_numeric, gene_names=gene_names, size_factors=size_factors, batch_size=batch_size, @@ -3033,13 +3204,13 @@ def _parse_grouping(data, sample_description, grouping): def _split_X(data, grouping): groups = np.unique(grouping) - x0 = data[grouping == groups[0], :] - x1 = data[grouping == groups[1], :] + x0 = data[np.where(grouping == groups[0])[0]] + x1 = data[np.where(grouping == groups[1])[0]] return x0, x1 def t_test( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping, gene_names=None, sample_description=None, @@ -3059,27 +3230,29 @@ def t_test( :param sample_description: optional pandas.DataFrame containing sample annotations """ gene_names = _parse_gene_names(data, gene_names) - X: xr.DataArray = _parse_data(data, gene_names) + X = _parse_data(data, gene_names) + if isinstance(X, SparseXArrayDataSet): + X = X.X grouping = _parse_grouping(data, sample_description, grouping) de_test = DifferentialExpressionTestTT( data=X.astype(dtype), grouping=grouping, - gene_ids=gene_names, + gene_names=gene_names, ) return de_test -def wilcoxon( - data, +def rank_test( + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping, gene_names=None, sample_description=None, dtype="float32" ): """ - Perform Wilcoxon rank sum test for differential expression + Perform Mann-Whitney rank test (Wilcoxon rank-sum test) for differential expression between two groups on adata object for each gene. :param data: Array-like, xr.DataArray, xr.Dataset or anndata.Anndata object containing observations. 
@@ -3092,10 +3265,12 @@ def wilcoxon( :param sample_description: optional pandas.DataFrame containing sample annotations """ gene_names = _parse_gene_names(data, gene_names) - X: xr.DataArray = _parse_data(data, gene_names) + X = _parse_data(data, gene_names) + if isinstance(X, SparseXArrayDataSet): + X = X.X grouping = _parse_grouping(data, sample_description, grouping) - de_test = DifferentialExpressionTestWilcoxon( + de_test = DifferentialExpressionTestRank( data=X.astype(dtype), grouping=grouping, gene_names=gene_names, @@ -3105,7 +3280,7 @@ def wilcoxon( def two_sample( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test=None, @@ -3275,7 +3450,7 @@ def two_sample( dtype=dtype ) elif test.lower() == 'wilcoxon': - de_test = wilcoxon( + de_test = rank_test( data=X, gene_names=gene_names, grouping=grouping, @@ -3288,7 +3463,7 @@ def two_sample( def pairwise( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = [], test: str = 'z-test', @@ -3506,7 +3681,7 @@ def pairwise( def versus_rest( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], grouping: Union[str, np.ndarray, list], as_numeric: Union[List[str], Tuple[str], str] = (), test: str = 'wald', @@ -3671,7 +3846,7 @@ def versus_rest( def partition( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], partition: Union[str, np.ndarray, list], gene_names: str = None, sample_description: pd.DataFrame = None): @@ -3846,7 +4021,7 @@ def wilcoxon( """ DETestsSingle = [] for i, idx in enumerate(self.partition_idx): - DETestsSingle.append(wilcoxon( + DETestsSingle.append(rank_test( data=self.X[idx, :], grouping=grouping, gene_names=self.gene_names, @@ -3927,7 +4102,6 @@ def lrt( full_formula_loc=full_formula_loc, reduced_formula_scale=reduced_formula_scale, full_formula_scale=full_formula_scale, - as_numeric=as_numeric, gene_names=self.gene_names, sample_description=self.sample_description.iloc[idx, :], noise_model=noise_model, @@ -4027,7 +4201,7 @@ def wald( def continuous_1d( - data: Union[anndata.AnnData, xr.DataArray, xr.Dataset, np.ndarray], + data: Union[anndata.AnnData, anndata.base.Raw, xr.DataArray, xr.Dataset, np.ndarray, scipy.sparse.csr_matrix], continuous: str, df: int = 5, factor_loc_totest: Union[str, List[str]] = None, @@ -4190,11 +4364,6 @@ def continuous_1d( # Note that the brackets around formula_term_continuous propagate the sum # across interaction terms. 
formula_term_continuous = '(' + formula_extension + ')' - if formula is not None: - formula_new = formula.split(continuous) - formula_new = formula_term_continuous.join(formula_new) - else: - formula_new = None if formula_loc is not None: formula_loc_new = formula_loc.split(continuous) diff --git a/diffxpy/unit_test/test_data_types.py b/diffxpy/unit_test/test_data_types.py index 3699c04..8381c3d 100644 --- a/diffxpy/unit_test/test_data_types.py +++ b/diffxpy/unit_test/test_data_types.py @@ -3,7 +3,6 @@ import numpy as np import pandas as pd -import scipy.stats as stats import scipy.sparse import anndata @@ -11,22 +10,53 @@ import diffxpy.api as de -class TestDataTypes(unittest.TestCase): +class TestDataTypesSingle(unittest.TestCase): - def test_sparse_anndata(self, n_cells: int = 2000, n_genes: int = 100): - """ - Test if de.wald() generates a uniform p-value distribution - if it is given data simulated based on the null model. Returns the p-value - of the two-side Kolmgorov-Smirnov test for equality of the observed - p-value distribution and a uniform distribution. + def _test_wald(self, data, sample_description, gene_names=None): + test = de.test.wald( + data=data, + factor_loc_totest="condition", + formula_loc="~ 1 + condition", + sample_description=sample_description, + gene_names=gene_names, + quick_scale=True, + training_strategy="DEFAULT", + dtype="float64" + ) + summary = test.summary() - :param n_cells: Number of cells to simulate (number of observations per test). - :param n_genes: Number of genes to simulate (number of tests). - """ - logging.getLogger("tensorflow").setLevel(logging.ERROR) - logging.getLogger("batchglm").setLevel(logging.WARNING) - logging.getLogger("diffxpy").setLevel(logging.WARNING) + def _test_lrt(self, data, sample_description, gene_names=None): + test = de.test.lrt( + data=data, + full_formula_loc="~ 1 + condition", + reduced_formula_loc="~ 1", + sample_description=sample_description, + gene_names=gene_names, + quick_scale=True, + training_strategy="DEFAULT", + dtype="float64" + ) + summary = test.summary() + def _test_t_test(self, data, sample_description, gene_names=None): + test = de.test.t_test( + data=data, + grouping="condition", + sample_description=sample_description, + gene_names=gene_names + ) + summary = test.summary() + + def _test_rank(self, data, sample_description, gene_names=None): + test = de.test.rank_test( + data=data, + grouping="condition", + sample_description=sample_description, + gene_names=gene_names + ) + summary = test.summary() + + def simulate(self, n_cells: int = 20, n_genes: int = 2): sim = Simulator(num_observations=n_cells, num_features=n_genes) sim.generate_sample_description(num_batches=0, num_conditions=0) sim.generate() @@ -34,28 +64,87 @@ def test_sparse_anndata(self, n_cells: int = 2000, n_genes: int = 100): random_sample_description = pd.DataFrame({ "condition": np.random.randint(2, size=sim.num_observations) }) + return sim.X ,random_sample_description - adata = anndata.AnnData(scipy.sparse.csr_matrix(sim.X.values)) - # X = adata.X - test = de.test.wald( - data=adata, - factor_loc_totest="condition", - formula="~ 1 + condition", - sample_description=random_sample_description, - quick_scale=True, - training_strategy="DEFAULT", - dtype="float64" - ) - summary = test.summary() + def _test_numpy(self, sparse): + data, sample_description = self.simulate() + gene_names = data.features + data = data.values + if sparse: + data = scipy.sparse.csr_matrix(data) + + self._test_wald(data=data, 
sample_description=sample_description, gene_names=gene_names) + #self._test_lrt(data=data, sample_description=sample_description, gene_names=gene_names) + self._test_t_test(data=data, sample_description=sample_description, gene_names=gene_names) + self._test_rank(data=data, sample_description=sample_description, gene_names=gene_names) + + def _test_xarray(self): + data, sample_description = self.simulate() + + self._test_wald(data=data, sample_description=sample_description) + #self._test_lrt(data=data, sample_description=sample_description) + self._test_t_test(data=data, sample_description=sample_description) + self._test_rank(data=data, sample_description=sample_description) + + def _test_anndata(self, sparse): + data, sample_description = self.simulate() + gene_names = [str(x) for x in data.features.values] + data = data.values + if sparse: + data = scipy.sparse.csr_matrix(data) + + data = anndata.AnnData(data) + data.var_names = gene_names + self._test_wald(data=data, sample_description=sample_description) + #self._test_lrt(data=data, sample_description=sample_description) + self._test_t_test(data=data, sample_description=sample_description) + self._test_rank(data=data, sample_description=sample_description) - # Compare p-value distribution under null model against uniform distribution. - pval_h0 = stats.kstest(test.pval, 'uniform').pvalue + def _test_anndata_raw(self, sparse): + data, sample_description = self.simulate() + gene_names = [str(x) for x in data.features.values] + data = data.values + if sparse: + data = scipy.sparse.csr_matrix(data) - logging.getLogger("diffxpy").info('KS-test pvalue for null model match of wald(): %f' % pval_h0) - assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!" + data = anndata.AnnData(data) + data.var_names = gene_names + data.raw = data + self._test_wald(data=data.raw, sample_description=sample_description) + #self._test_lrt(data=data.raw, sample_description=sample_description) + self._test_t_test(data=data, sample_description=sample_description) + self._test_rank(data=data, sample_description=sample_description) + + def test_numpy(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + self._test_numpy(sparse=False) + self._test_numpy(sparse=True) return True + def test_xarray(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + self._test_xarray() + + return True + + def test_anndata(self): + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + self._test_anndata(sparse=False) + self._test_anndata(sparse=True) + self._test_anndata_raw(sparse=False) + self._test_anndata_raw(sparse=True) + + return True if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_single.py b/diffxpy/unit_test/test_single.py index cb3ca7a..048ff5b 100644 --- a/diffxpy/unit_test/test_single.py +++ b/diffxpy/unit_test/test_single.py @@ -36,7 +36,7 @@ def test_null_distribution_wald(self, n_cells: int = 2000, n_genes: int = 100): test = de.test.wald( data=sim.X, factor_loc_totest="condition", - formula="~ 1 + condition + batch", + formula_loc="~ 1 + condition + batch", sample_description=random_sample_description, batch_size=500, training_strategy="DEFAULT", @@ -77,7 
+77,7 @@ def test_null_distribution_wald_multi(self, n_cells: int = 2000, n_genes: int = test = de.test.wald( data=sim.X, factor_loc_totest="condition", - formula="~ 1 + condition", + formula_loc="~ 1 + condition", sample_description=random_sample_description, training_strategy="DEFAULT", dtype="float64" @@ -194,7 +194,7 @@ def test_null_distribution_wilcoxon(self, n_cells: int = 2000, n_genes: int = 10 "condition": np.random.randint(2, size=sim.num_observations) }) - test = de.test.wilcoxon( + test = de.test.rank_test( data=sim.X, grouping="condition", sample_description=random_sample_description, @@ -253,9 +253,59 @@ def _eval(self, sim, test): return sim + def test_wilcoxon_de(self, n_cells: int = 2000, n_genes: int = 100): + """ + Test if de.test.t_test() generates a uniform p-value distribution + if it is given data simulated based on the null model. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + + test = de.test.rank_test( + data=sim.X, + grouping="condition", + sample_description=sim.sample_description, + dtype="float64" + ) + + self._eval(sim=sim, test=test) + + return True + + def test_t_test_de(self, n_cells: int = 2000, n_genes: int = 100): + """ + Test if de.test.t_test() generates a uniform p-value distribution + if it is given data simulated based on the null model. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.WARNING) + + sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + + test = de.test.t_test( + data=sim.X, + grouping="condition", + sample_description=sim.sample_description, + dtype="float64" + ) + + self._eval(sim=sim, test=test) + + return True + def test_wald_de(self, n_cells: int = 2000, n_genes: int = 100): """ - Test if de.lrt() generates a uniform p-value distribution + Test if de.test.wald() generates a uniform p-value distribution if it is given data simulated based on the null model. :param n_cells: Number of cells to simulate (number of observations per test). @@ -270,7 +320,7 @@ def test_wald_de(self, n_cells: int = 2000, n_genes: int = 100): test = de.test.wald( data=sim.X, factor_loc_totest="condition", - formula="~ 1 + condition", + formula_loc="~ 1 + condition", sample_description=sim.sample_description, training_strategy="DEFAULT", dtype="float64" @@ -282,7 +332,7 @@ def test_wald_de(self, n_cells: int = 2000, n_genes: int = 100): def test_lrt_de(self, n_cells: int = 2000, n_genes: int = 100): """ - Test if de.lrt() generates a uniform p-value distribution + Test if de.test.lrt() generates a uniform p-value distribution if it is given data simulated based on the null model. Returns the p-value of the two-side Kolmgorov-Smirnov test for equality of the observed p-value distribution and a uniform distribution. 
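The null-distribution tests in this file all reduce to the same check, sketched here with test as a placeholder for any of the fitted test objects; this mirrors the assertion removed from test_data_types.py, and _eval presumably performs an equivalent comparison:

import scipy.stats as stats

# p-values simulated under the null model should be approximately uniform on [0, 1].
pval_h0 = stats.kstest(test.pval, "uniform").pvalue
assert pval_h0 > 0.05, "KS-Test failed: pval_h0 is <= 0.05!"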
@@ -312,5 +362,110 @@ def test_lrt_de(self, n_cells: int = 2000, n_genes: int = 100): return True +class TestSingleExternal(unittest.TestCase): + + def _prepare_data(self, n_cells: int = 2000, n_genes: int = 100): + """ + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + sim = Simulator(num_observations=n_cells, num_features=n_genes) + sim.generate_sample_description(num_batches=0, num_conditions=2) + sim.generate_params( + rand_fn_ave=lambda shape: np.random.poisson(500, shape) + 1, + rand_fn=lambda shape: np.abs(np.random.uniform(1, 0.5, shape)) + ) + sim.generate_data() + + return sim + + def _eval(self, test, ref_pvals): + test_pval = test.pval + pval_dev = np.abs(test_pval - ref_pvals) + log_pval_dev = np.abs(np.log(test_pval+1e-200) - np.log(ref_pvals+1e-200)) + max_dev = np.max(pval_dev) + max_log_dev = np.max(log_pval_dev) + mean_dev = np.mean(log_pval_dev) + logging.getLogger("diffxpy").info( + 'maximum absolute p-value deviation: %f' % + float(max_dev) + ) + logging.getLogger("diffxpy").info( + 'maximum absolute log p-value deviation: %f' % + float(max_log_dev) + ) + logging.getLogger("diffxpy").info( + 'mean absolute log p-value deviation: %f' % + float(mean_dev) + ) + assert max_dev < 1e-3, "maximum deviation too large" + assert max_log_dev < 1e-1, "maximum deviation in log space too large" + + def test_t_test_ref(self, n_cells: int = 2000, n_genes: int = 100): + """ + Test if de.test.t_test() generates the same p-value distribution as scipy t-test. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.INFO) + + sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + + test = de.test.t_test( + data=sim.X, + grouping="condition", + sample_description=sim.sample_description, + dtype="float64" + ) + + # Run scipy t-tests as a reference. + conds = np.unique(sim.sample_description["condition"].values) + ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] + ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] + scipy_pvals = stats.ttest_ind(a=sim.X[ind_a, :], b=sim.X[ind_b, :], axis=0, equal_var=False).pvalue + + self._eval(test=test, ref_pvals=scipy_pvals) + + return True + + def test_wilcoxon_ref(self, n_cells: int = 2000, n_genes: int = 100): + """ + Test if de.test.t_test() generates the same p-value distribution as scipy t-test. + + :param n_cells: Number of cells to simulate (number of observations per test). + :param n_genes: Number of genes to simulate (number of tests). + """ + logging.getLogger("tensorflow").setLevel(logging.ERROR) + logging.getLogger("batchglm").setLevel(logging.WARNING) + logging.getLogger("diffxpy").setLevel(logging.INFO) + + sim = self._prepare_data(n_cells=n_cells, n_genes=n_genes) + + test = de.test.rank_test( + data=sim.X, + grouping="condition", + sample_description=sim.sample_description, + dtype="float64" + ) + + # Run scipy t-tests as a reference. 
+ conds = np.unique(sim.sample_description["condition"].values) + ind_a = np.where(sim.sample_description["condition"] == conds[0])[0] + ind_b = np.where(sim.sample_description["condition"] == conds[1])[0] + scipy_pvals = np.array([ + stats.mannwhitneyu(x=sim.X[ind_a, i], y=sim.X[ind_b, i], + use_continuity=True, alternative="two-sided").pvalue + for i in range(sim.X.shape[1]) + ]) + + self._eval(test=test, ref_pvals=scipy_pvals) + + return True + + if __name__ == '__main__': unittest.main() diff --git a/diffxpy/unit_test/test_stats.py b/diffxpy/unit_test/test_stats.py index f88779a..d517609 100644 --- a/diffxpy/unit_test/test_stats.py +++ b/diffxpy/unit_test/test_stats.py @@ -128,7 +128,7 @@ def test_wilcoxon(self, n: int = 1000, n_test: int = 100): x1 = np.vstack([np.random.normal(loc=locs[i], scale=scales[i], size=n_test) for i in range(n)]).T # Compute p-value distribution under null model. - pvals = de.stats.wilcoxon_test(x0=x0, x1=x1) + pvals = de.stats.mann_whitney_u_test(x0=x0, x1=x1) # Compare p-value distribution under null model against uniform distribution. pval_h0 = stats.kstest(pvals, 'uniform').pvalue
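For completeness, a sketch of what the renamed helper computes per column, checked against a direct scipy loop on simulated data (standalone; the equivalence follows from the wrapper shown in stats.py above):

import numpy as np
import scipy.stats
import diffxpy.api as de

rng = np.random.RandomState(0)
x0 = rng.normal(size=(100, 10))
x1 = rng.normal(size=(100, 10))

pvals = de.stats.mann_whitney_u_test(x0=x0, x1=x1)
ref = np.array([
    scipy.stats.mannwhitneyu(
        x=x0[:, i], y=x1[:, i],
        use_continuity=True, alternative="two-sided"
    ).pvalue
    for i in range(x0.shape[1])
])
assert np.allclose(pvals, ref)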