Merge pull request #9 from Intron7/v0.3.3
V0.3.3
Intron7 authored Jan 18, 2023
2 parents 2e15090 + 1495fd5 commit c1edcd4
Showing 14 changed files with 546 additions and 482 deletions.
2 changes: 1 addition & 1 deletion README.md
@@ -15,7 +15,7 @@ ipython kernel install --user --name=rapids_singelcell
```
After you set up the environment, you can install this package from this wheel into the environment. The wheel doesn't install any dependencies.
```
pip install https://github.com/Intron7/rapids_singlecell/releases/download/v0.3.2/rapids_singlecell-0.3.2-py3-none-any.whl
pip install https://github.com/Intron7/rapids_singlecell/releases/download/v0.3.3/rapids_singlecell-0.3.3-py3-none-any.whl
```

With this environment, you should be able to run the notebooks. So far I have tested these notebooks on an A100 80GB, a Quadro RTX 6000 and an RTX 3090.
2 changes: 1 addition & 1 deletion conda/rapids_singecell.yml
@@ -4,7 +4,7 @@ channels:
- conda-forge
- bioconda
dependencies:
- rapids=22.10
- rapids=22.12
- python=3.9
- cudatoolkit=11.5
- cudnn
296 changes: 149 additions & 147 deletions notebooks/demo_cpu-seuratv3.ipynb

Large diffs are not rendered by default.

138 changes: 69 additions & 69 deletions notebooks/demo_cpu.ipynb

Large diffs are not rendered by default.

212 changes: 108 additions & 104 deletions notebooks/demo_gpu-seuratv3.ipynb

Large diffs are not rendered by default.

196 changes: 98 additions & 98 deletions notebooks/demo_gpu.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion rapids_singlecell/__init__.py
@@ -3,4 +3,4 @@
from . import decoupler_gpu as dcg
from . import scanpy_gpu as tl

__version__ = '0.3.2'
__version__ = '0.3.3'
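
As a quick sanity check after installing the new wheel, a hedged sketch (not part of the commit) of what the version bump and the aliases in the hunk above look like from user code:

```
import rapids_singlecell as rsc

print(rsc.__version__)   # expected: '0.3.3' after this release
# decoupler_gpu and scanpy_gpu are available as rsc.dcg and rsc.tl (see hunk above)
```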
10 changes: 5 additions & 5 deletions rapids_singlecell/cunnData_funcs/_hvg.py
@@ -146,7 +146,7 @@ def highly_variable_genes(
if batch_key is None:
X = cudata.layers[layer] if layer is not None else cudata.X
df = _highly_variable_genes_single_batch(
X.tocsc(),
X.copy(),
min_disp=min_disp,
max_disp=max_disp,
min_mean=min_mean,
@@ -164,7 +164,7 @@ def highly_variable_genes(
thr_org = cp.diff(inter_matrix.indptr).ravel()
thr = cp.where(thr_org >= 1)[0]
thr_2 = cp.where(thr_org < 1)[0]
inter_matrix = inter_matrix[:, thr]
inter_matrix = inter_matrix[:, thr].tocsr()
thr = thr.get()
thr_2 = thr_2.get()
inter_genes = genes[thr]
@@ -374,7 +374,7 @@ def _highly_variable_genes_seurat_v3(
UserWarning,
)

mean, var = _get_mean_var(X.tocsc())
mean, var = _get_mean_var(X)
df['means'], df['variances'] = mean.get(), var.get()
if batch_key is None:
batch_info = pd.Categorical(np.zeros(cudata.shape[0], dtype=int))
@@ -384,7 +384,7 @@ def _highly_variable_genes_seurat_v3(
norm_gene_vars = []
for b in np.unique(batch_info):
X_batch = X[batch_info == b]
mean, var = _get_mean_var(X_batch.tocsc())
mean, var = _get_mean_var(X_batch)
not_const = var > 0
estimat_var = cp.zeros(X_batch.shape[1], dtype=np.float64)

@@ -527,7 +527,7 @@ def _highly_variable_pearson_residuals(cudata: cunnData,
ranks_masked_array = np.ma.masked_invalid(ranks_residual_var)
# Median rank across batches, ignoring batches in which gene was not selected
medianrank_residual_var = np.ma.median(ranks_masked_array, axis=0).filled(np.nan)
means, variances = _get_mean_var(X.tocsc())
means, variances = _get_mean_var(X)
means, variances = means.get(), variances.get()
df = pd.DataFrame.from_dict(
dict(
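
The hunks above drop the `.tocsc()` conversions because `_get_mean_var` no longer mutates its input (see the `_utils.py` change further down). A hedged usage sketch, assuming `highly_variable_genes` is re-exported from `cunnData_funcs`; the layer and batch key names are illustrative, not part of this diff:

```
from rapids_singlecell.cunnData_funcs import highly_variable_genes

# "lognorm" and "sample" are hypothetical names in your cunnData object
highly_variable_genes(cudata, layer="lognorm", batch_key="sample")
```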
31 changes: 25 additions & 6 deletions rapids_singlecell/cunnData_funcs/_normalize.py
@@ -1,31 +1,44 @@
import cupy as cp
import cupyx as cpx
import numpy as np
import math
import warnings
from typing import Optional

from ..cunnData import cunnData
from ._utils import _check_nonnegative_integers

def normalize_total(cudata: cunnData, target_sum):
def normalize_total(cudata: cunnData,
target_sum:int,
layer: Optional[str] = None,
inplace = True):
"""
Normalizes rows in matrix so they sum to `target_sum`
Parameters
----------
cudata: cunnData object
target_sum : int
Each row will be normalized to sum to this value
layer
Layer to normalize instead of `X`. If `None`, `X` is normalized.
inplace: bool
Whether to update `cudata` or return the normalized matrix.
Returns
-------
a normalized sparse Matrix to a specified target sum
Returns a normalized copy or updates `cudata` with a normalized version of
the original `cudata.X` and `cudata.layers['layer']`, depending on `inplace`.
"""
csr_arr = cudata.X
csr_arr = cudata.layers[layer] if layer is not None else cudata.X

if not inplace:
csr_arr = csr_arr.copy()

mul_kernel = cp.RawKernel(r'''
extern "C" __global__
void mul_kernel(const int *indptr, float *data,
@@ -56,7 +69,13 @@ def normalize_total(cudata: cunnData, target_sum):
csr_arr.shape[0],
int(target_sum)))

cudata.X = csr_arr
if inplace:
if layer:
cudata.layers[layer] = csr_arr
else:
cudata.X = csr_arr
else:
return csr_arr

def log1p(cudata: cunnData):
"""
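
A short usage sketch for the new `layer`/`inplace` options of `normalize_total`; the import path assumes the function is re-exported from `cunnData_funcs`, and the layer name "counts" is an illustrative assumption:

```
from rapids_singlecell.cunnData_funcs import normalize_total

normalize_total(cudata, target_sum=10000, layer="counts")        # updates the layer in place
norm = normalize_total(cudata, target_sum=10000, inplace=False)   # returns a normalized copy of cudata.X
```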
3 changes: 1 addition & 2 deletions rapids_singlecell/cunnData_funcs/_pca.py
@@ -5,7 +5,6 @@
from typing import Optional

from cupy.sparse import issparse
import warnings
import math
import numpy as np

@@ -112,4 +111,4 @@ def pca(cudata: cunnData,
else:
cudata.varm['PCs'] = pca_func.components_.T



77 changes: 47 additions & 30 deletions rapids_singlecell/cunnData_funcs/_regress_out.py
@@ -2,12 +2,14 @@
import cupyx as cpx
from cuml.linear_model import LinearRegression
from rapids_singlecell.cunnData import cunnData
from typing import Literal, Union
from typing import Literal, Union,Optional
from ..cunnData import cunnData
import math

def regress_out(cudata:cunnData,
keys,
layer: Optional[str] = None,
inplace = True,
batchsize: Union[int,Literal["all"],None] = 100,
verbose=False):

@@ -16,11 +18,18 @@ def regress_out(cudata:cunnData,
and variation.
Parameters
----------
adata
The annotated data matrix.
cudata: cunnData object
keys
Keys for numerical observation annotation on which to regress on.
layer
Layer to regress instead of `X`. If `None`, `X` is regressed.
inplace: bool
Whether to update `cudata` or return the corrected matrix of
`cudata.X` and `cudata.layers`.
batchsize: Union[int,Literal["all"],None] (default: 100)
Number of genes that should be processed together.
If `'all'` all genes will be processed together if `.n_obs` <100000.
@@ -31,64 +40,72 @@
Print debugging information
Returns
-------
updates cunndata object with the corrected data matrix
Returns a corrected copy or updates `cudata` with a corrected version of the
original `cudata.X` and `cudata.layers['layer']`, depending on `inplace`.
"""

if batchsize != "all" and type(batchsize) not in [int, type(None)]:
raise ValueError("batchsize must be `int`, `None` or `'all'`")

if cpx.scipy.sparse.issparse(cudata.X) and not cpx.scipy.sparse.isspmatrix_csc(cudata.X):
cudata.X = cudata.X.tocsc()
X= cudata.layers[layer] if layer is not None else cudata.X

if cpx.scipy.sparse.issparse(X) and not cpx.scipy.sparse.isspmatrix_csc(X):
X = X.tocsc()

dim_regressor= 2
if type(keys)is list:
dim_regressor = len(keys)+1

regressors = cp.ones((cudata.X.shape[0]*dim_regressor)).reshape((cudata.X.shape[0], dim_regressor), order="F")
regressors = cp.ones((X.shape[0]*dim_regressor)).reshape((X.shape[0], dim_regressor), order="F")
if dim_regressor==2:
regressors[:, 1] = cp.array(cudata.obs[keys]).ravel()
else:
for i in range(dim_regressor-1):
regressors[:, i+1] = cp.array(cudata.obs[keys[i]]).ravel()

outputs = cp.empty(cudata.X.shape, dtype=cudata.X.dtype, order="F")
outputs = cp.empty(X.shape, dtype=X.dtype, order="F")

cuml_supports_multi_target = LinearRegression._get_tags()['multioutput']

if cuml_supports_multi_target and batchsize:
if batchsize == "all" and cudata.X.shape[0] < 100000:
if cpx.scipy.sparse.issparse(cudata.X):
cudata.X = cudata.X.todense()
X = regressors
if batchsize == "all" and X.shape[0] < 100000:
if cpx.scipy.sparse.issparse(X):
X = X.todense()
lr = LinearRegression(fit_intercept=False, output_type="cupy", algorithm='svd')
lr.fit(X, cudata.X, convert_dtype=True)
outputs[:] = cudata.X - lr.predict(X)
lr.fit(regressors, X, convert_dtype=True)
outputs[:] = X - lr.predict(regressors)
else:
if batchsize == "all":
batchsize = 100
n_batches = math.ceil(cudata.X.shape[1] / batchsize)
n_batches = math.ceil(X.shape[1] / batchsize)
for batch in range(n_batches):
start_idx = batch * batchsize
stop_idx = min(batch * batchsize + batchsize, cudata.X.shape[1])
if cpx.scipy.sparse.issparse(cudata.X):
arr_batch = cudata.X[:,start_idx:stop_idx].todense()
stop_idx = min(batch * batchsize + batchsize, X.shape[1])
if cpx.scipy.sparse.issparse(X):
arr_batch = X[:,start_idx:stop_idx].todense()
else:
arr_batch = cudata.X[:,start_idx:stop_idx].copy()
X = regressors
arr_batch = X[:,start_idx:stop_idx].copy()
lr = LinearRegression(fit_intercept=False, output_type="cupy", algorithm='svd')
lr.fit(X, arr_batch, convert_dtype=True)
outputs[:,start_idx:stop_idx] =arr_batch - lr.predict(X)
lr.fit(regressors, arr_batch, convert_dtype=True)
outputs[:,start_idx:stop_idx] =arr_batch - lr.predict(regressors)
else:
if cudata.X.shape[0] < 100000 and cpx.scipy.sparse.issparse(cudata.X):
cudata.X = cudata.X.todense()
for i in range(cudata.X.shape[1]):
if X.shape[0] < 100000 and cpx.scipy.sparse.issparse(X):
X = X.todense()
for i in range(X.shape[1]):
if verbose and i % 500 == 0:
print("Regressed %s out of %s" %(i, cudata.X.shape[1]))
X = regressors
y = cudata.X[:,i]
outputs[:, i] = _regress_out_chunk(X, y)
print("Regressed %s out of %s" %(i, X.shape[1]))

cudata.X= outputs
y = X[:,i]
outputs[:, i] = _regress_out_chunk(regressors, y)

if inplace:
if layer:
cudata.layers[layer] = outputs
else:
cudata.X = outputs
else:
return outputs


def _regress_out_chunk(X, y):
"""
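
A hedged usage sketch of `regress_out` with the new `layer`/`inplace` arguments; the obs keys and layer name are illustrative assumptions, and the import path assumes the function is re-exported from `cunnData_funcs`:

```
from rapids_singlecell.cunnData_funcs import regress_out

# regress two (hypothetical) numeric obs covariates out of a layer, 100 genes per batch
regress_out(cudata, keys=["total_counts", "pct_counts_mt"], layer="lognorm", batchsize=100)

# or return the corrected dense matrix without touching `cudata`
corrected = regress_out(cudata, keys="total_counts", inplace=False)
```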
44 changes: 34 additions & 10 deletions rapids_singlecell/cunnData_funcs/_scale.py
@@ -1,28 +1,52 @@
import cupy as cp
from ..cunnData import cunnData
from typing import Optional

def scale(cudata:cunnData, max_value=10):

def scale(cudata:cunnData,
max_value=10,
layer: Optional[str] = None,
inplace = True):
"""
Scales matrix to unit variance and clips values
Parameters
----------
cudata:
cunnData object
max_value : int
After scaling matrix to unit variance,
values will be clipped to this number
of std deviations.
After scaling matrix to unit variance,
values will be clipped to this number
of std deviations.
layer : Optional[str] (default: None)
Layer to use as input instead of X. If None, X is used.
inplace : bool (default: True)
If True, update cunnData with results. Otherwise, return results. See below for details of what is returned.
Return
------
updates cunndata object with a scaled cunndata.X
Returns a scaled copy or updates `cudata` with a scaled version of the
original `cudata.X` and `cudata.layers['layer']`, depending on `inplace`.
"""
if type(cudata.X) is not cp._core.core.ndarray:
X = cudata.layers[layer] if layer is not None else cudata.X

if type(X) is not cp._core.core.ndarray:
print("densifying _.X")
X = cudata.X.toarray()
X = X.toarray()
else:
X =cudata.X
mean = X.mean(axis=0)
X =X.copy()
mean = X.sum(axis=0).flatten() / X.shape[0]
X -= mean
del mean
stddev = cp.sqrt(X.var(axis=0))
X /= stddev
del stddev
cudata.X = cp.clip(X,a_max=max_value)
X= cp.clip(X,a_max=max_value)
if inplace:
if layer:
cudata.layers[layer] = X
else:
cudata.X = X
else:
return X
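
Similarly for `scale`, a minimal sketch of the new options (the layer name is illustrative; the import path is assumed as above):

```
from rapids_singlecell.cunnData_funcs import scale

scale(cudata, max_value=10)                               # scales cudata.X in place
scaled = scale(cudata, layer="lognorm", inplace=False)    # returns a scaled copy of the layer
```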
9 changes: 4 additions & 5 deletions rapids_singlecell/cunnData_funcs/_utils.py
@@ -1,11 +1,10 @@
import cupy as cp


def _get_mean_var(X):
mean = (X.sum(axis =0)/X.shape[0]).ravel()
X.data **= 2
inter = (X.sum(axis =0)/X.shape[0]).ravel()
var = inter - mean ** 2
mean = X.sum(axis=0).flatten() / X.shape[0]
mean_sq = X.multiply(X).sum(axis=0).flatten() / X.shape[0]
var = mean_sq - mean ** 2
var *= X.shape[1]/ ( X.shape[0] - 1)
return mean, var

def _check_nonnegative_integers(X):
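
The rewritten `_get_mean_var` above avoids the in-place `X.data **= 2`, so callers no longer need to pass a defensive `.tocsc()` copy. A toy sketch of the new computation on a small CSR matrix (values are illustrative; the final scaling factor simply mirrors the line kept in the diff):

```
import cupy as cp
import cupyx.scipy.sparse as cpsp

X = cpsp.csr_matrix(cp.array([[1., 0., 2.],
                              [0., 3., 4.]], dtype=cp.float32))

mean = X.sum(axis=0).flatten() / X.shape[0]
mean_sq = X.multiply(X).sum(axis=0).flatten() / X.shape[0]
var = (mean_sq - mean ** 2) * X.shape[1] / (X.shape[0] - 1)
# X.data is left untouched, unlike the previous implementation
```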
6 changes: 3 additions & 3 deletions rapids_singlecell/scanpy_gpu/_tsne.py
@@ -6,7 +6,7 @@ def tsne(adata: AnnData,
use_rep:str= None,
perplexity:int = 30,
early_exaggeration:int = 12,
learning_rate:int =1000):
learning_rate:int =200):
"""
Performs t-distributed stochastic neighborhood embedding (tSNE) using the cuML library. Variable descriptions are adapted from scanpy and the defaults are the same.
@@ -22,7 +22,7 @@
The perplexity is related to the number of nearest neighbors that is used in other manifold learning algorithms. Larger datasets usually require a larger perplexity. Consider selecting a value between 5 and 50. The choice is not extremely critical since t-SNE is quite insensitive to this parameter.
early_exaggeration : float (default:12)
Controls how tight natural clusters in the original space are in the embedded space and how much space will be between them. For larger values, the space between natural clusters will be larger in the embedded space. Again, the choice of this parameter is not very critical. If the cost function increases during initial optimization, the early exaggeration factor or the learning rate might be too high.
learning_rate : float (default:1000)
learning_rate : float (default:200)
Note that the R-package “Rtsne” and cuML use a default of 200. The learning rate can be a critical parameter. It should be between 100 and 1000. If the cost function increases during initial optimization, the early exaggeration factor or the learning rate might be too high. If the cost function gets stuck in a bad local minimum, increasing the learning rate sometimes helps.
"""
if use_rep == None:
@@ -31,4 +31,4 @@
data = adata.obsm[use_rep]
if n_pcs is not None:
data = data[:,:n_pcs]
adata.obsm['X_tsne'] = TSNE(perplexity=perplexity, early_exaggeration=early_exaggeration,learning_rate=learning_rate).fit_transform(data)
adata.obsm['X_tsne'] = TSNE(perplexity=perplexity, early_exaggeration=early_exaggeration,learning_rate=learning_rate).fit_transform(data)
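
A usage sketch reflecting the new default (`learning_rate=200`, matching cuML and Rtsne); the `rsc.tl` alias follows the `scanpy_gpu as tl` re-export shown earlier, and it is assumed that `tsne` is exposed at that level:

```
import rapids_singlecell as rsc

# assumes adata.obsm['X_pca'] already exists
rsc.tl.tsne(adata)                        # now uses learning_rate=200 by default
rsc.tl.tsne(adata, learning_rate=1000)    # restores the previous default explicitly
```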
