Merge pull request #9 from Intron7/v0.3.3
V0.3.3
Intron7 authored Jan 18, 2023
2 parents 2e15090 + 1495fd5 commit c1edcd4
Showing 14 changed files with 546 additions and 482 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ ipython kernel install --user --name=rapids_singelcell
```
After you set up the environment, you can install this package from this wheel into the environment. The wheel doesn't install any dependencies.
```
pip install https://github.com/Intron7/rapids_singlecell/releases/download/v0.3.2/rapids_singlecell-0.3.2-py3-none-any.whl
pip install https://github.com/Intron7/rapids_singlecell/releases/download/v0.3.3/rapids_singlecell-0.3.3-py3-none-any.whl
```

With this environment, you should be able to run the notebooks. So far I have tested these notebooks on an A100 80GB, a Quadro RTX 6000 and an RTX 3090.
2 changes: 1 addition & 1 deletion conda/rapids_singecell.yml
@@ -4,7 +4,7 @@ channels:
- conda-forge
- bioconda
dependencies:
- rapids=22.10
- rapids=22.12
- python=3.9
- cudatoolkit=11.5
- cudnn
296 changes: 149 additions & 147 deletions notebooks/demo_cpu-seuratv3.ipynb

Large diffs are not rendered by default.

138 changes: 69 additions & 69 deletions notebooks/demo_cpu.ipynb

Large diffs are not rendered by default.

212 changes: 108 additions & 104 deletions notebooks/demo_gpu-seuratv3.ipynb

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions notebooks/demo_gpu.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion rapids_singlecell/__init__.py
@@ -3,4 +3,4 @@
from . import decoupler_gpu as dcg
from . import scanpy_gpu as tl

__version__ = '0.3.2'
__version__ = '0.3.3'
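
As a quick sanity check after installing the new wheel, a hedged sketch (not part of the commit) of what the version bump and the aliases in the hunk above look like from user code:

```
import rapids_singlecell as rsc

print(rsc.__version__)   # expected: '0.3.3' after this release
# decoupler_gpu and scanpy_gpu are available as rsc.dcg and rsc.tl (see hunk above)
```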
10 changes: 5 additions & 5 deletions rapids_singlecell/cunnData_funcs/_hvg.py
@@ -146,7 +146,7 @@ def highly_variable_genes(
if batch_key is None:
X = cudata.layers[layer] if layer is not None else cudata.X
df = _highly_variable_genes_single_batch(
X.tocsc(),
X.copy(),
min_disp=min_disp,
max_disp=max_disp,
min_mean=min_mean,
@@ -164,7 +164,7 @@ def highly_variable_genes(
thr_org = cp.diff(inter_matrix.indptr).ravel()
thr = cp.where(thr_org >= 1)[0]
thr_2 = cp.where(thr_org < 1)[0]
inter_matrix = inter_matrix[:, thr]
inter_matrix = inter_matrix[:, thr].tocsr()
thr = thr.get()
thr_2 = thr_2.get()
inter_genes = genes[thr]
@@ -374,7 +374,7 @@ def _highly_variable_genes_seurat_v3(
UserWarning,
)

mean, var = _get_mean_var(X.tocsc())
mean, var = _get_mean_var(X)
df['means'], df['variances'] = mean.get(), var.get()
if batch_key is None:
batch_info = pd.Categorical(np.zeros(cudata.shape[0], dtype=int))
@@ -384,7 +384,7 @@ def _highly_variable_genes_seurat_v3(
norm_gene_vars = []
for b in np.unique(batch_info):
X_batch = X[batch_info == b]
mean, var = _get_mean_var(X_batch.tocsc())
mean, var = _get_mean_var(X_batch)
not_const = var > 0
estimat_var = cp.zeros(X_batch.shape[1], dtype=np.float64)

@@ -527,7 +527,7 @@ def _highly_variable_pearson_residuals(cudata: cunnData,
ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
# Median rank across batches, ignoring batches in which gene was not selected
medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)
means, variances = _get_mean_var(X.tocsc())
means, variances = _get_mean_var(X)
means, variances = means.get(), variances.get()
df = pd.DataFrame.from_dict(
dict(
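
The hunks above drop the `.tocsc()` conversions because `_get_mean_var` no longer mutates its input (see the `_utils.py` change further down). A hedged usage sketch, assuming `highly_variable_genes` is re-exported from `cunnData_funcs`; the layer and batch key names are illustrative, not part of this diff:

```
from rapids_singlecell.cunnData_funcs import highly_variable_genes

# "lognorm" and "sample" are hypothetical names in your cunnData object
highly_variable_genes(cudata, layer="lognorm", batch_key="sample")
```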
31 changes: 25 additions & 6 deletions rapids_singlecell/cunnData_funcs/_normalize.py
@@ -1,31 +1,44 @@
import cupy as cp
import cupyx as cpx
import numpy as np
import math
import warnings
from typing import Optional

from ..cunnData import cunnData
from ._utils import _check_nonnegative_integers

def normalize_total(cudata: cunnData, target_sum):
def normalize_total(cudata: cunnData,
target_sum:int,
layer: Optional[str] = None,
inplace = True):
"""
Normalizes rows in matrix so they sum to `target_sum`
Parameters
----------
cudata: cunnData object
target_sum : int
Each row will be normalized to sum to this value
layer
Layer to normalize instead of `X`. If `None`, `X` is normalized.
inplace: bool
Whether to update `cudata` or return the normalized matrix.
Returns
-------
a normalized sparse Matrix to a specified target sum
Returns a normalized copy or updates `cudata` with a normalized version of
the original `cudata.X` and `cudata.layers['layer']`, depending on `inplace`.
"""
csr_arr = cudata.X
csr_arr = cudata.layers[layer] if layer is not None else cudata.X

if not inplace:
csr_arr = csr_arr.copy()

mul_kernel = cp.RawKernel(r'''
extern "C" __global__
void mul_kernel(const int *indptr, float *data,
@@ -56,7 +69,13 @@ def normalize_total(cudata: cunnData, target_sum):
csr_arr.shape[0],
int(target_sum)))

cudata.X = csr_arr
if inplace:
if layer:
cudata.layers[layer] = csr_arr
else:
cudata.X = csr_arr
else:
return csr_arr

def log1p(cudata: cunnData):
"""
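
A short usage sketch for the new `layer`/`inplace` options of `normalize_total`; the import path assumes the function is re-exported from `cunnData_funcs`, and the layer name "counts" is an illustrative assumption:

```
from rapids_singlecell.cunnData_funcs import normalize_total

normalize_total(cudata, target_sum=10000, layer="counts")        # updates the layer in place
norm = normalize_total(cudata, target_sum=10000, inplace=False)   # returns a normalized copy of cudata.X
```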
3 changes: 1 addition & 2 deletions rapids_singlecell/cunnData_funcs/_pca.py
@@ -5,7 +5,6 @@
from typing import Optional

from cupy.sparse import issparse
import warnings
import math
import numpy as np

@@ -112,4 +111,4 @@ def pca(cudata: cunnData,
else:
cudata.varm['PCs'] = pca_func.components_.T



77 changes: 47 additions & 30 deletions rapids_singlecell/cunnData_funcs/_regress_out.py
@@ -2,12 +2,14 @@
import cupyx as cpx
from cuml.linear_model import LinearRegression
from rapids_singlecell.cunnData import cunnData
from typing import Literal, Union
from typing import Literal, Union,Optional
from ..cunnData import cunnData
import math

def regress_out(cudata:cunnData,
keys,
layer: Optional[str] = None,
inplace = True,
batchsize: Union[int,Literal["all"],None] = 100,
verbose=False):

@@ -16,11 +18,18 @@ def regress_out(cudata:cunnData,
and variation.
Parameters
----------
adata
The annotated data matrix.
cudata: cunnData object
keys
Keys for numerical observation annotation on which to regress on.
layer
Layer to regress instead of `X`. If `None`, `X` is regressed.
inplace: bool
Whether to update `cudata` or return the corrected matrix of
`cudata.X` and `cudata.layers`.
batchsize: Union[int,Literal["all"],None] (default: 100)
Number of genes that should be processed together.
If `'all'` all genes will be processed together if `.n_obs` <100000.
@@ -31,64 +40,72 @@
Print debugging information
Returns
-------
updates cunndata object with the corrected data matrix
Returns a corrected copy or updates `cudata` with a corrected version of the
original `cudata.X` and `cudata.layers['layer']`, depending on `inplace`.
"""

if batchsize != "all" and type(batchsize) not in [int, type(None)]:
raise ValueError("batchsize must be `int`, `None` or `'all'`")

if cpx.scipy.sparse.issparse(cudata.X) and not cpx.scipy.sparse.isspmatrix_csc(cudata.X):
cudata.X = cudata.X.tocsc()
X= cudata.layers[layer] if layer is not None else cudata.X

if cpx.scipy.sparse.issparse(X) and not cpx.scipy.sparse.isspmatrix_csc(X):
X = X.tocsc()

dim_regressor= 2
if type(keys)is list:
dim_regressor = len(keys)+1

regressors = cp.ones((cudata.X.shape[0]*dim_regressor)).reshape((cudata.X.shape[0], dim_regressor), order="F")
regressors = cp.ones((X.shape[0]*dim_regressor)).reshape((X.shape[0], dim_regressor), order="F")
if dim_regressor==2:
regressors[:, 1] = cp.array(cudata.obs[keys]).ravel()
else:
for i in range(dim_regressor-1):
regressors[:, i+1] = cp.array(cudata.obs[keys[i]]).ravel()

outputs = cp.empty(cudata.X.shape, dtype=cudata.X.dtype, order="F")
outputs = cp.empty(X.shape, dtype=X.dtype, order="F")

cuml_supports_multi_target = LinearRegression._get_tags()['multioutput']

if cuml_supports_multi_target and batchsize:
if batchsize == "all" and cudata.X.shape[0] < 100000:
if cpx.scipy.sparse.issparse(cudata.X):
cudata.X = cudata.X.todense()
X = regressors
if batchsize == "all" and X.shape[0] < 100000:
if cpx.scipy.sparse.issparse(X):
X = X.todense()
lr = LinearRegression(fit_intercept=False, output_type="cupy", algorithm='svd')
lr.fit(X, cudata.X, convert_dtype=True)
outputs[:] = cudata.X - lr.predict(X)
lr.fit(regressors, X, convert_dtype=True)
outputs[:] = X - lr.predict(regressors)
else:
if batchsize == "all":
batchsize = 100
n_batches = math.ceil(cudata.X.shape[1] / batchsize)
n_batches = math.ceil(X.shape[1] / batchsize)
for batch in range(n_batches):
start_idx = batch * batchsize
stop_idx = min(batch * batchsize + batchsize, cudata.X.shape[1])
if cpx.scipy.sparse.issparse(cudata.X):
arr_batch = cudata.X[:,start_idx:stop_idx].todense()
stop_idx = min(batch * batchsize + batchsize, X.shape[1])
if cpx.scipy.sparse.issparse(X):
arr_batch = X[:,start_idx:stop_idx].todense()
else:
arr_batch = cudata.X[:,start_idx:stop_idx].copy()
X = regressors
arr_batch = X[:,start_idx:stop_idx].copy()
lr = LinearRegression(fit_intercept=False, output_type="cupy", algorithm='svd')
lr.fit(X, arr_batch, convert_dtype=True)
outputs[:,start_idx:stop_idx] =arr_batch - lr.predict(X)
lr.fit(regressors, arr_batch, convert_dtype=True)
outputs[:,start_idx:stop_idx] =arr_batch - lr.predict(regressors)
else:
if cudata.X.shape[0] < 100000 and cpx.scipy.sparse.issparse(cudata.X):
cudata.X = cudata.X.todense()
for i in range(cudata.X.shape[1]):
if X.shape[0] < 100000 and cpx.scipy.sparse.issparse(X):
X = X.todense()
for i in range(X.shape[1]):
if verbose and i % 500 == 0:
print("Regressed %s out of %s" %(i, cudata.X.shape[1]))
X = regressors
y = cudata.X[:,i]
outputs[:, i] = _regress_out_chunk(X, y)
print("Regressed %s out of %s" %(i, X.shape[1]))

cudata.X= outputs
y = X[:,i]
outputs[:, i] = _regress_out_chunk(regressors, y)

if inplace:
if layer:
cudata.layers[layer] = outputs
else:
cudata.X = outputs
else:
return outputs


def _regress_out_chunk(X, y):
"""
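
A hedged usage sketch of `regress_out` with the new `layer`/`inplace` arguments; the obs keys and layer name are illustrative assumptions, and the import path assumes the function is re-exported from `cunnData_funcs`:

```
from rapids_singlecell.cunnData_funcs import regress_out

# regress two (hypothetical) numeric obs covariates out of a layer, 100 genes per batch
regress_out(cudata, keys=["total_counts", "pct_counts_mt"], layer="lognorm", batchsize=100)

# or return the corrected dense matrix without touching `cudata`
corrected = regress_out(cudata, keys="total_counts", inplace=False)
```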
44 changes: 34 additions & 10 deletions rapids_singlecell/cunnData_funcs/_scale.py
@@ -1,28 +1,52 @@
import cupy as cp
from ..cunnData import cunnData
from typing import Optional

def scale(cudata:cunnData, max_value=10):

def scale(cudata:cunnData,
max_value=10,
layer: Optional[str] = None,
inplace = True):
"""
Scales matrix to unit variance and clips values
Parameters
----------
cudata:
cunnData object
max_value : int
After scaling matrix to unit variance,
values will be clipped to this number
of std deviations.
After scaling matrix to unit variance,
values will be clipped to this number
of std deviations.
layer : Optional[str] (default: None)
Layer to use as input instead of X. If None, X is used.
inplace : bool (default: True)
If True, update cunnData with results. Otherwise, return results. See below for details of what is returned.
Return
------
updates cunndata object with a scaled cunndata.X
Returns a scaled copy or updates `cudata` with a scaled version of the
original `cudata.X` and `cudata.layers['layer']`, depending on `inplace`.
"""
if type(cudata.X) is not cp._core.core.ndarray:
X = cudata.layers[layer] if layer is not None else cudata.X

if type(X) is not cp._core.core.ndarray:
print("densifying _.X")
X = cudata.X.toarray()
X = X.toarray()
else:
X =cudata.X
mean = X.mean(axis=0)
X =X.copy()
mean = X.sum(axis=0).flatten() / X.shape[0]
X -= mean
del mean
stddev = cp.sqrt(X.var(axis=0))
X /= stddev
del stddev
cudata.X = cp.clip(X,a_max=max_value)
X= cp.clip(X,a_max=max_value)
if inplace:
if layer:
cudata.layers[layer] = X
else:
cudata.X = X
else:
return X
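
Similarly for `scale`, a minimal sketch of the new options (the layer name is illustrative; the import path is assumed as above):

```
from rapids_singlecell.cunnData_funcs import scale

scale(cudata, max_value=10)                               # scales cudata.X in place
scaled = scale(cudata, layer="lognorm", inplace=False)    # returns a scaled copy of the layer
```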
9 changes: 4 additions & 5 deletions rapids_singlecell/cunnData_funcs/_utils.py
@@ -1,11 +1,10 @@
import cupy as cp


def _get_mean_var(X):
mean = (X.sum(axis =0)/X.shape[0]).ravel()
X.data **= 2
inter = (X.sum(axis =0)/X.shape[0]).ravel()
var = inter - mean ** 2
mean = X.sum(axis=0).flatten() / X.shape[0]
mean_sq = X.multiply(X).sum(axis=0).flatten() / X.shape[0]
var = mean_sq - mean ** 2
var *= X.shape[1]/ ( X.shape[0] - 1)
return mean, var

def _check_nonnegative_integers(X):
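
The rewritten `_get_mean_var` above avoids the in-place `X.data **= 2`, so callers no longer need to pass a defensive `.tocsc()` copy. A toy sketch of the new computation on a small CSR matrix (values are illustrative; the final scaling factor simply mirrors the line kept in the diff):

```
import cupy as cp
import cupyx.scipy.sparse as cpsp

X = cpsp.csr_matrix(cp.array([[1., 0., 2.],
                              [0., 3., 4.]], dtype=cp.float32))

mean = X.sum(axis=0).flatten() / X.shape[0]
mean_sq = X.multiply(X).sum(axis=0).flatten() / X.shape[0]
var = (mean_sq - mean ** 2) * X.shape[1] / (X.shape[0] - 1)
# X.data is left untouched, unlike the previous implementation
```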
6 changes: 3 additions & 3 deletions rapids_singlecell/scanpy_gpu/_tsne.py
@@ -6,7 +6,7 @@ def tsne(adata: AnnData,
use_rep:str= None,
perplexity:int = 30,
early_exaggeration:int = 12,
learning_rate:int =1000):
learning_rate:int =200):
"""
Performs t-distributed stochastic neighborhood embedding (tSNE) using the cuML library. Variable descriptions are adapted from scanpy and the defaults are the same.
@@ -22,7 +22,7 @@
The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. The choice is not extremely critical since t-SNE is quite insensitive to this parameter.
early_exaggeration : float (default:12)
Controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. For larger values, the space between natural clusters will be larger in the embedded space. Again, the choice of this parameter is not very critical. If the cost function increases during initial optimization, the early exaggeration factor or the learning rate might be too high.
learning_rate : float (default:1000)
learning_rate : float (default:200)
Note that the R-package “Rtsne” and cuML use a default of 200. The learning rate can be a critical parameter. It should be between 100 and 1000. If the cost function increases during initial optimization, the early exaggeration factor or the learning rate might be too high. If the cost function gets stuck in a bad local minimum, increasing the learning rate sometimes helps.
"""
if use_rep == None:
@@ -31,4 +31,4 @@
data = adata.obsm[use_rep]
if n_pcs is not None:
data = data[:,:n_pcs]
adata.obsm['X_tsne'] = TSNE(perplexity=perplexity, early_exaggeration=early_exaggeration,learning_rate=learning_rate).fit_transform(data)
adata.obsm['X_tsne'] = TSNE(perplexity=perplexity, early_exaggeration=early_exaggeration,learning_rate=learning_rate).fit_transform(data)
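
A usage sketch reflecting the new default (`learning_rate=200`, matching cuML and Rtsne); the `rsc.tl` alias follows the `scanpy_gpu as tl` re-export shown earlier, and it is assumed that `tsne` is exposed at that level:

```
import rapids_singlecell as rsc

# assumes adata.obsm['X_pca'] already exists
rsc.tl.tsne(adata)                        # now uses learning_rate=200 by default
rsc.tl.tsne(adata, learning_rate=1000)    # restores the previous default explicitly
```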
