v1.0.3: adapt to scanpy v1.10

jsxlei · Apr 16, 2024 · 7cef922 · 7cef922
1 parent 1e3952d
commit 7cef922
Show file tree

Hide file tree

Showing 6 changed files with 43 additions and 24 deletions.
diff --git a/SCALEX.py b/SCALEX.py
@@ -47,6 +47,7 @@
     parser.add_argument('--assess', action='store_true')
     parser.add_argument('--eval', action='store_true')
     parser.add_argument('--num_workers', type=int, default=4)
+    parser.add_argument('--keep_mt', action='store_true')
     # parser.add_argument('--version', type=int, default=2)
     # parser.add_argument('--k', type=str, default=30)
     # parser.add_argument('--embed', type=str, default='UMAP')
@@ -70,6 +71,7 @@
         fraction=args.fraction,
         n_obs=args.n_obs,
         processed=args.processed,
+        keep_mt=args.keep_mt,
         use_layer=args.use_layer,
         backed=args.backed,
         batch_size=args.batch_size, 

diff --git a/docs/source/news.rst b/docs/source/news.rst
@@ -2,5 +2,5 @@ News
 =====
 .. role:: small
 
-SCALEX is online on `Nature Communications <https://www.nature.com/articles/s41467-022-33758-z>`_ :small:`2022-10-17`  
+SCALEX is online on `Nature Communications <https://www.nature.com/articles/s41467-022-33758-z>`_ :small:`2022-10-17`     
 SCALEX is available on `bioRxiv <https://www.biorxiv.org/content/10.1101/2021.04.06.438536v1>`_ :small:`2021-04-09`
diff --git a/requirements.txt b/requirements.txt
@@ -1,12 +1,12 @@
-numpy>=1.17.2
-pandas>=0.25.1
-scipy>=1.3.1
-scikit-learn>=0.22.1
-torch>=1.0.0
-scanpy>=1.4.5
-tqdm>=4.28.1
-matplotlib>=3.0.3
-seaborn>=0.9.0
+numpy>=1.26.4
+pandas>=2.2.2
+scipy>=1.13.0
+scikit-learn>=1.4.2
+torch>=2.2.2
+scanpy>=1.10.1
+tqdm>=4.66.2
+matplotlib>=3.8.4
+seaborn>=0.13.2
 leidenalg>=0.8.3
 sphinx_autodoc_typehints
 nbsphinx
diff --git a/scalex/__init__.py b/scalex/__init__.py
@@ -1,7 +1,7 @@
 # from pkg_resources import get_distribution
 
 # __version__ = get_distribution('scalex').version
-__version__ = '1.0.2'
+__version__ = '1.0.3'
 __author__ = 'Lei Xiong'
 __email__ = '[email protected]'
 

diff --git a/scalex/data.py b/scalex/data.py
@@ -197,6 +197,7 @@ def preprocessing_rna(
         target_sum: int = 10000, 
         n_top_features = 2000, # or gene list
         chunk_size: int = CHUNK_SIZE,
+        keep_mt: bool = False,
         backed: bool = False,
         log=None
     ):
@@ -233,8 +234,10 @@ def preprocessing_rna(
     if type(adata.X) != csr.csr_matrix and (not backed) and (not adata.isbacked):
         adata.X = scipy.sparse.csr_matrix(adata.X)
 
-    adata = adata[:, [gene for gene in adata.var_names 
-                  if not str(gene).startswith(tuple(['ERCC', 'MT-', 'mt-']))]]
+    if not keep_mt:
+        if log: log.info('Filtering out MT genes')
+        adata = adata[:, [gene for gene in adata.var_names 
+                    if not str(gene).startswith(tuple(['ERCC', 'MT-', 'mt-']))]]
 
     if log: log.info('Filtering cells')
     sc.pp.filter_cells(adata, min_genes=min_features)
@@ -251,7 +254,8 @@ def preprocessing_rna(
     adata.raw = adata
     if log: log.info('Finding variable features')
     if type(n_top_features) == int and n_top_features>0:
-        sc.pp.highly_variable_genes(adata, n_top_genes=n_top_features, batch_key='batch', inplace=False, subset=True)
+        sc.pp.highly_variable_genes(adata, n_top_genes=n_top_features, batch_key='batch') #, inplace=False, subset=True)
+        adata = adata[:, adata.var.highly_variable].copy()
     elif type(n_top_features) != int:
         adata = reindex(adata, n_top_features)
 
@@ -344,6 +348,7 @@ def preprocessing(
         min_cells: int = 3, 
         target_sum: int = None, 
         n_top_features = None, # or gene list
+        keep_mt: bool = False,
         backed: bool = False,
         chunk_size: int = CHUNK_SIZE,
         log=None
@@ -382,6 +387,7 @@ def preprocessing(
                    min_cells=min_cells, 
                    target_sum=target_sum,
                    n_top_features=n_top_features, 
+                   keep_mt=keep_mt,
                    backed=backed,
                    chunk_size=chunk_size, 
                    log=log
@@ -482,7 +488,7 @@ def __iter__(self):
         batch = {}
         sampler = np.random.permutation(len(self.batch_id))
         for idx in sampler:
-            c = self.batch_id[idx]
+            c = self.batch_id.iloc[idx]
             if c not in batch:
                 batch[c] = []
             batch[c].append(idx)
@@ -549,6 +555,7 @@ def load_data(
         min_cells=3, 
         target_sum=None,
         n_top_features=None, 
+        keep_mt=False,
         backed=False,
         batch_size=64, 
         chunk_size=CHUNK_SIZE,
@@ -626,6 +633,7 @@ def load_data(
             min_cells=min_cells, 
             target_sum=target_sum,
             n_top_features=n_top_features,
+            keep_mt=keep_mt,
             chunk_size=chunk_size,
             backed=backed,
             log=log,

diff --git a/scalex/function.py b/scalex/function.py
@@ -38,6 +38,7 @@ def SCALEX(
         fraction:float=None,
         n_obs:int=None,
         use_layer:str='X',
+        keep_mt:bool=False,
         backed:bool=False,
         batch_size:int=64, 
         lr:float=2e-4, 
@@ -55,6 +56,7 @@ def SCALEX(
         show:bool=True,
         eval:bool=False,
         num_workers:int=4,
+        cell_type:str='cell_type',
     ) -> AnnData:
     """
     Online single-cell data integration through projecting heterogeneous datasets into a common cell-embedding space
@@ -153,6 +155,7 @@ def SCALEX(
             backed=backed,
             batch_name=batch_name, 
             batch_key=batch_key,
+            keep_mt=keep_mt,
             log=log,
             num_workers=num_workers,
         )
@@ -197,6 +200,7 @@ def SCALEX(
             processed=processed,
             batch_name=batch_name,
             batch_key=batch_key,
+            # keep_mt=keep_mt,
             log = log,
             num_workers=num_workers,
         )
@@ -230,7 +234,8 @@ def SCALEX(
 
         # UMAP visualization
         sc.set_figure_params(dpi=80, figsize=(3,3))
-        cols = ['batch', 'celltype', 'cell_type', 'leiden']
+        cols = [cell_type, 'leiden'] 
+        cols += ['batch'] if n_domain > 1 else []
         color = [c for c in cols if c in adata.obs]
         if outdir:
             sc.settings.figdir = outdir
@@ -243,17 +248,21 @@ def SCALEX(
                 embedding(adata, color='leiden', groupby='projection', save=save, show=show)
             else:
                 sc.pl.umap(adata, color=color, save=save, wspace=0.4, ncols=4, show=show)  
-        if assess:
-            if len(adata.obs['batch'].cat.categories) > 1:
-                entropy_score = batch_entropy_mixing_score(adata.obsm['X_umap'], adata.obs['batch'])
-                log.info('batch_entropy_mixing_score: {:.3f}'.format(entropy_score))
-
-            if 'celltype' in adata.obs:
-                sil_score = silhouette_score(adata.obsm['X_umap'], adata.obs['celltype'].cat.codes)
-                log.info("silhouette_score: {:.3f}".format(sil_score))
 
     if outdir is not None:
         adata.write(os.path.join(outdir, 'adata.h5ad'), compression='gzip')
+
+    if assess:
+        if adata.shape[0] > 5e4:
+            log.info('The number of cells is too large to calculate entropy_batch_mixing_score and silhouette_score')
+            sc.pp.subsample(adata, n_obs=int(5e4))
+        if len(adata.obs['batch'].cat.categories) > 1:
+            entropy_score = batch_entropy_mixing_score(adata.obsm['X_umap'], adata.obs['batch'])
+            log.info('batch_entropy_mixing_score: {:.3f}'.format(entropy_score))
+
+        if cell_type in adata.obs:
+            sil_score = silhouette_score(adata.obsm['X_umap'], adata.obs[cell_type].cat.codes)
+            log.info("silhouette_score: {:.3f}".format(sil_score))
 
     return adata