diff --git a/bak/npy2csv_script.py b/bak/npy2csv_script.py new file mode 100644 index 0000000..cb35774 --- /dev/null +++ b/bak/npy2csv_script.py @@ -0,0 +1,50 @@ +import numpy as np +import pandas as pd + +def convert(method='dca'): + t=np.load(method+'\\9.Chung_0.0_1_recon.npy') + df = pd.DataFrame(t) + df.to_csv(method+'_9.csv',header=None,index=False) + + t=np.load(method+'\\11.Kolodziejczyk_0.0_1_recon.npy') + df = pd.DataFrame(t) + df.to_csv(method+'_11.csv',header=None,index=False) + + t=np.load(method+'\\12.Klein_0.0_1_recon.npy') + df = pd.DataFrame(t) + df.to_csv(method+'_12.csv',header=None,index=False) + + t=np.load(method+'\\13.Zeisel_0.0_1_recon.npy') + df = pd.DataFrame(t) + df.to_csv(method+'_13.csv',header=None,index=False) + +convert('dca') +convert('deepimpute') +convert('magic') +convert('netNMFsc') +convert('saucie') +convert('saver') +convert('scimpute') +convert('scvi') + + +def convertCSV(method='scIGANs'): + df = pd.read_csv(method+'\\9.Chung_0.0_1_recon.csv.txt',sep='\s+',index_col=0) + df = df.T + df.to_csv(method+'_9.csv',header=None,index=False) + + df = pd.read_csv(method+'\\11.Kolodziejczyk_0.0_1_recon.csv.txt',sep='\s+',index_col=0) + df = df.T + df.to_csv(method+'_11.csv',header=None,index=False) + + df = pd.read_csv(method+'\\12.Klein_0.0_1_recon.csv.txt',sep='\s+',index_col=0) + df = df.T + df.to_csv(method+'_12.csv',header=None,index=False) + + df = pd.read_csv(method+'\\13.Zeisel_0.0_1_recon.csv.txt',sep='\s+',index_col=0) + df = df.T + df.to_csv(method+'_13.csv',header=None,index=False) + +convertCSV('scIGANs') + + diff --git a/otherresults/BAK_MAGIC.py b/bak/otherresults/BAK_MAGIC.py similarity index 100% rename from otherresults/BAK_MAGIC.py rename to bak/otherresults/BAK_MAGIC.py diff --git a/otherresults/MAGIC_analysis.sh b/bak/otherresults/MAGIC_analysis.sh similarity index 100% rename from otherresults/MAGIC_analysis.sh rename to bak/otherresults/MAGIC_analysis.sh diff --git a/otherresults/MAGIC_analysis_usage.sh b/bak/otherresults/MAGIC_analysis_usage.sh similarity index 100% rename from otherresults/MAGIC_analysis_usage.sh rename to bak/otherresults/MAGIC_analysis_usage.sh diff --git a/otherresults/MAGIC_impute.py b/bak/otherresults/MAGIC_impute.py similarity index 100% rename from otherresults/MAGIC_impute.py rename to bak/otherresults/MAGIC_impute.py diff --git a/otherresults/MAGIC_impute_usage.py b/bak/otherresults/MAGIC_impute_usage.py similarity index 100% rename from otherresults/MAGIC_impute_usage.py rename to bak/otherresults/MAGIC_impute_usage.py diff --git a/otherresults/Other_Results_Evaluation.sh b/bak/otherresults/Other_Results_Evaluation.sh similarity index 100% rename from otherresults/Other_Results_Evaluation.sh rename to bak/otherresults/Other_Results_Evaluation.sh diff --git a/otherresults/Other_results_Reading.py b/bak/otherresults/Other_results_Reading.py similarity index 100% rename from otherresults/Other_results_Reading.py rename to bak/otherresults/Other_results_Reading.py diff --git a/otherresults/Other_results_celltype.py b/bak/otherresults/Other_results_celltype.py similarity index 100% rename from otherresults/Other_results_celltype.py rename to bak/otherresults/Other_results_celltype.py diff --git a/otherresults/Other_results_impute.py b/bak/otherresults/Other_results_impute.py similarity index 100% rename from otherresults/Other_results_impute.py rename to bak/otherresults/Other_results_impute.py diff --git a/otherresults/README.md b/bak/otherresults/README.md similarity index 100% rename from otherresults/README.md 
rename to bak/otherresults/README.md diff --git a/otherresults/SAUCIE_analysis.sh b/bak/otherresults/SAUCIE_analysis.sh similarity index 100% rename from otherresults/SAUCIE_analysis.sh rename to bak/otherresults/SAUCIE_analysis.sh diff --git a/otherresults/SAUCIE_celltype.py b/bak/otherresults/SAUCIE_celltype.py similarity index 100% rename from otherresults/SAUCIE_celltype.py rename to bak/otherresults/SAUCIE_celltype.py diff --git a/otherresults/SAUCIE_impute.py b/bak/otherresults/SAUCIE_impute.py similarity index 100% rename from otherresults/SAUCIE_impute.py rename to bak/otherresults/SAUCIE_impute.py diff --git a/otherresults/SAVER_impute.R b/bak/otherresults/SAVER_impute.R similarity index 100% rename from otherresults/SAVER_impute.R rename to bak/otherresults/SAVER_impute.R diff --git a/otherresults/SCIMPUTE_impute.R b/bak/otherresults/SCIMPUTE_impute.R similarity index 100% rename from otherresults/SCIMPUTE_impute.R rename to bak/otherresults/SCIMPUTE_impute.R diff --git a/otherresults/dca_impute.py b/bak/otherresults/dca_impute.py similarity index 100% rename from otherresults/dca_impute.py rename to bak/otherresults/dca_impute.py diff --git a/otherresults/scVi_impute.py b/bak/otherresults/scVi_impute.py similarity index 100% rename from otherresults/scVi_impute.py rename to bak/otherresults/scVi_impute.py diff --git a/otherresults/simulation_generator.R b/bak/otherresults/simulation_generator.R similarity index 100% rename from otherresults/simulation_generator.R rename to bak/otherresults/simulation_generator.R diff --git a/results/calculateROGUE.R b/bak/results/calculateROGUE.R similarity index 100% rename from results/calculateROGUE.R rename to bak/results/calculateROGUE.R diff --git a/results/compare_varID.py b/bak/results/compare_varID.py similarity index 100% rename from results/compare_varID.py rename to bak/results/compare_varID.py diff --git a/results/jobinfo_imp_23dropout.txt b/bak/results/jobinfo_imp_23dropout.txt similarity index 100% rename from results/jobinfo_imp_23dropout.txt rename to bak/results/jobinfo_imp_23dropout.txt diff --git a/results/jobinfo_imp_explore.txt b/bak/results/jobinfo_imp_explore.txt similarity index 100% rename from results/jobinfo_imp_explore.txt rename to bak/results/jobinfo_imp_explore.txt diff --git a/results/jobinfo_imp_louvain_2.txt b/bak/results/jobinfo_imp_louvain_2.txt similarity index 100% rename from results/jobinfo_imp_louvain_2.txt rename to bak/results/jobinfo_imp_louvain_2.txt diff --git a/results/results_ROGUE.py b/bak/results/results_ROGUE.py similarity index 100% rename from results/results_ROGUE.py rename to bak/results/results_ROGUE.py diff --git a/results/results_Reading.py b/bak/results/results_Reading.py similarity index 99% rename from results/results_Reading.py rename to bak/results/results_Reading.py index 88f34aa..50ebc2b 100644 --- a/results/results_Reading.py +++ b/bak/results/results_Reading.py @@ -13,6 +13,7 @@ args = parser.parse_args() # Note: +# Main Check results # Generate results in python other than in shell for better organization # We are not use runpy.run_path('main_result.py') for it is hard to pass arguments # We are not use subprocess.call("python main_result.py", shell=True) for it runs scripts parallel diff --git a/results/results_Reading_23.py b/bak/results/results_Reading_23.py similarity index 100% rename from results/results_Reading_23.py rename to bak/results/results_Reading_23.py diff --git a/results/results_Reading_23dropout.py b/bak/results/results_Reading_23dropout.py similarity index 
100% rename from results/results_Reading_23dropout.py rename to bak/results/results_Reading_23dropout.py diff --git a/results/results_Reading_explore.py b/bak/results/results_Reading_explore.py similarity index 100% rename from results/results_Reading_explore.py rename to bak/results/results_Reading_explore.py diff --git a/results/results_Reading_graph.py b/bak/results/results_Reading_graph.py similarity index 100% rename from results/results_Reading_graph.py rename to bak/results/results_Reading_graph.py diff --git a/results/results_imputation.sh b/bak/results/results_imputation.sh similarity index 100% rename from results/results_imputation.sh rename to bak/results/results_imputation.sh diff --git a/results/results_imputation_0.3.sh b/bak/results/results_imputation_0.3.sh similarity index 100% rename from results/results_imputation_0.3.sh rename to bak/results/results_imputation_0.3.sh diff --git a/results/results_imputation_grid.sh b/bak/results/results_imputation_grid.sh similarity index 100% rename from results/results_imputation_grid.sh rename to bak/results/results_imputation_grid.sh diff --git a/results/results_impute.py b/bak/results/results_impute.py similarity index 94% rename from results/results_impute.py rename to bak/results/results_impute.py index 61796dc..f265477 100644 --- a/results/results_impute.py +++ b/bak/results/results_impute.py @@ -56,8 +56,8 @@ dropix = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_dropix.npy') featuresImpute = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_recon'+args.reconstr+'.npy') -l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax = imputation_error_log(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) -print('{:.4f} {:.4f} {:.4f} {:.4f} '.format(l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax), end='') +l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax, rmse = imputation_error_log(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) +print('{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} '.format(l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax, rmse), end='') def imputeResult(inputData): ''' diff --git a/results/results_impute_graph_ROC.py b/bak/results/results_impute_graph_ROC.py similarity index 100% rename from results/results_impute_graph_ROC.py rename to bak/results/results_impute_graph_ROC.py diff --git a/results/results_impute_graph_ROC.sh b/bak/results/results_impute_graph_ROC.sh similarity index 100% rename from results/results_impute_graph_ROC.sh rename to bak/results/results_impute_graph_ROC.sh diff --git a/results/submitCluster_Result_Celltype.sh b/bak/results/submitCluster_Result_Celltype.sh similarity index 100% rename from results/submitCluster_Result_Celltype.sh rename to bak/results/submitCluster_Result_Celltype.sh diff --git a/results/submitCluster_Result_Impute.sh b/bak/results/submitCluster_Result_Impute.sh similarity index 100% rename from results/submitCluster_Result_Impute.sh rename to bak/results/submitCluster_Result_Impute.sh diff --git a/results/submitCluster_Result_Impute_23.sh b/bak/results/submitCluster_Result_Impute_23.sh similarity index 100% rename from results/submitCluster_Result_Impute_23.sh rename to bak/results/submitCluster_Result_Impute_23.sh diff --git a/results/submitCluster_Result_Impute_23dropout.sh b/bak/results/submitCluster_Result_Impute_23dropout.sh similarity index 100% rename from results/submitCluster_Result_Impute_23dropout.sh rename to 
bak/results/submitCluster_Result_Impute_23dropout.sh diff --git a/results/submitCluster_Result_Impute_explore.sh b/bak/results/submitCluster_Result_Impute_explore.sh similarity index 100% rename from results/submitCluster_Result_Impute_explore.sh rename to bak/results/submitCluster_Result_Impute_explore.sh diff --git a/results/submitCluster_Result_Impute_graph.sh b/bak/results/submitCluster_Result_Impute_graph.sh similarity index 100% rename from results/submitCluster_Result_Impute_graph.sh rename to bak/results/submitCluster_Result_Impute_graph.sh diff --git a/results/summary.sh b/bak/results/summary.sh similarity index 100% rename from results/summary.sh rename to bak/results/summary.sh diff --git a/results/summary_cmd.py b/bak/results/summary_cmd.py similarity index 100% rename from results/summary_cmd.py rename to bak/results/summary_cmd.py diff --git a/benchmark_util.py b/benchmark_util.py index d85d409..d2fc1ba 100644 --- a/benchmark_util.py +++ b/benchmark_util.py @@ -530,6 +530,7 @@ def imputation_error(X_mean, X, X_zero, i, j, ix): all_index = i[ix], j[ix] x, y = X_mean[all_index], X[all_index] result = np.abs(x - y) + rmse = np.sqrt(np.mean((x - y)**2)) # If the input is a sparse matrix else: all_index = i[ix], j[ix] @@ -538,8 +539,9 @@ def imputation_error(X_mean, X, X_zero, i, j, ix): yuse = scipy.sparse.lil_matrix.todense(y) yuse = np.asarray(yuse).reshape(-1) result = np.abs(x - yuse) + rmse = np.sqrt(np.mean((x - yuse)**2)) # return np.median(np.abs(x - yuse)) - return np.mean(result), np.median(result), np.min(result), np.max(result) + return np.mean(result), np.median(result), np.min(result), np.max(result), rmse # IMPUTATION METRICS @@ -562,6 +564,7 @@ def imputation_error_log(X_mean, X, X_zero, i, j, ix): all_index = i[ix], j[ix] x, y = X_mean[all_index], X[all_index] result = np.abs(x - np.log(y+1)) + rmse = np.sqrt(np.mean((x - np.log(y+1))**2)) # If the input is a sparse matrix else: all_index = i[ix], j[ix] @@ -570,10 +573,11 @@ def imputation_error_log(X_mean, X, X_zero, i, j, ix): yuse = scipy.sparse.lil_matrix.todense(y) yuse = np.asarray(yuse).reshape(-1) result = np.abs(x - np.log(yuse+1)) + rmse = np.sqrt(np.mean((x - np.log(yuse+1))**2)) # return np.median(np.abs(x - yuse)) - return np.mean(result), np.median(result), np.min(result), np.max(result) + return np.mean(result), np.median(result), np.min(result), np.max(result), rmse -# cosine similarity +# cosine similarity with log def imputation_cosine_log(X_mean, X, X_zero, i, j, ix): """ X_mean: imputed dataset diff --git a/codesfromJGandYJ/codeForCellcluster/Run_netNMF_celltype.py b/codesfromJGandYJ/codeForCellcluster/Run_netNMF_celltype.py new file mode 100644 index 0000000..19e5b1f --- /dev/null +++ b/codesfromJGandYJ/codeForCellcluster/Run_netNMF_celltype.py @@ -0,0 +1,71 @@ +# This code has not been cleaned yet +# run netNMF-sc from command line and save outputs to specified directory +from __future__ import print_function +import numpy as np +from warnings import warn +from joblib import Parallel, delayed +import copy,argparse,os,math,random,time +from scipy import sparse, io,linalg +from scipy.sparse import csr_matrix +import warnings,os +from netNMFsc import plot +warnings.simplefilter(action='ignore', category=FutureWarning) +import pandas as pd + +def main(args): + if args.method == 'GD': + from netNMFsc import netNMFGD + operator = netNMFGD(d=args.dimensions, alpha=args.alpha, n_inits=1, tol=args.tol, max_iter=args.max_iters, n_jobs=1) + elif args.method == 'MU': + from netNMFsc
import netNMFMU + operator = netNMFMU(d=args.dimensions, alpha=args.alpha, n_inits=1, tol=args.tol, max_iter=args.max_iters, n_jobs=1) + + + chung = pd.read_csv(args.filename, header=0, + index_col=0, sep=',') + X = chung.values + genes = [] + for gen in chung.index.values: + if '.' in gen: + genes.append(gen.upper().split('.')[0]) + else: + genes.append(gen.upper()) + #print(genes) + operator.X = X + operator.genes = np.asarray(genes) + #operator.load_10X(direc=args.tenXdir,genome='mm10') + operator.load_network(net=args.network,genenames=args.netgenes,sparsity=args.sparsity) + dictW = operator.fit_transform() + W, H = dictW['W'], dictW['H'] + k,clusters = plot.select_clusters(H,max_clusters=20) + plot.tSNE(H,clusters,fname=args.direc + '/netNMFsc_tsne') + os.system('mkdir -p %s'%(args.direc)) + np.save(os.path.join(args.direc,'W.npy'),W) + np.save(os.path.join(args.direc,'H.npy'),H) + np.save(os.path.join(args.direc, 'cluster.npy'), clusters) + return +#/storage/htc/joshilab/jghhd/singlecellTest/netNMFsc/netNMF-sc/netNMFsc/refdata/ + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-m","--method",help="either 'GD for gradient descent or MU for multiplicative update",type=str,default='GD') + parser.add_argument("-f","--filename", help="path to data file (.npy or .mtx)",type=str,default='matrix.mtx') + parser.add_argument("-g","--gene_names", help="path to file containing gene names (.npy or .tsv)",type=str,default='gene_names.tsv') + parser.add_argument("-net","--network", help="path to network file (.npy or .mtx)",type=str,default='') + parser.add_argument("-netgenes","--netgenes", help="path to file containing gene names for network (.npy or .tsv)",type=str,default='') + parser.add_argument("-org","--organism", help="mouse or human",type=str,default='human') + parser.add_argument("-id","--idtype", help="ensemble, symbol, or entrez",type=str,default='ensemble') + parser.add_argument("-netid","--netidtype", help="ensemble, symbol, or entrez",type=str,default='entrez') + parser.add_argument("-n","--normalize", help="normalize data? 1 = yes, 0 = no",type=int,default=0) + parser.add_argument("-sparse","--sparsity", help="sparsity for network",type=float,default=0.99) + parser.add_argument("-mi","--max_iters", help="max iters for netNMF-sc",type=int,default=1500) + parser.add_argument("-t","--tol", help="tolerence for netNMF-sc",type=float,default=1e-2) + parser.add_argument("-d","--direc", help="directory to save files",default='') + parser.add_argument("-D","--dimensions", help="number of dimensions to apply shift",type=int,default = 10) + parser.add_argument("-a","--alpha", help="lambda param for netNMF-sc",type=float,default = 1.0) + parser.add_argument("-x","--tenXdir", help="data is from 10X. 
Only required to provide directory containing matrix.mtx, genes.tsv, barcodes.tsv files",type=str,default = '') + args = parser.parse_args() + main(args) + + +#'/storage/htc/joshilab/jghhd/singlecellTest/Data/11.Kolodziejczyk/Use_expression.csv' diff --git a/codesfromJGandYJ/impute code/MAGIC_impute.py b/codesfromJGandYJ/impute code/MAGIC_impute.py deleted file mode 100644 index c0c1f22..0000000 --- a/codesfromJGandYJ/impute code/MAGIC_impute.py +++ /dev/null @@ -1,82 +0,0 @@ -# Analysis using MAGIC method -import magic -import pandas as pd -import matplotlib.pyplot as plt -import numpy as np -import argparse -import sys -sys.path.append('../') -sys.path.append('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/') -#from benchmark_util import impute_dropout - -def impute_dropout(X, rate=0.1): - """ - X: original testing set - ======== - returns: - X_zero: copy of X with zeros - i, j, ix: indices of where dropout is applied - """ - #If the input is a dense matrix - if isinstance(X, np.ndarray): - X_zero = np.copy(X) - # select non-zero subset - i,j = np.nonzero(X_zero) - # If the input is a sparse matrix - else: - X_zero = scipy.sparse.lil_matrix.copy(X) - # select non-zero subset - i,j = X_zero.nonzero() - # choice number 1 : select 10 percent of the non zero values (so that distributions overlap enough) - ix = np.random.choice(range(len(i)), int(np.floor(0.1 * len(i))), replace=False) - X_zero[i[ix], j[ix]] *= np.random.binomial(1, rate) - # choice number 2, focus on a few but corrupt binomially - #ix = np.random.choice(range(len(i)), int(slice_prop * np.floor(len(i))), replace=False) - #X_zero[i[ix], j[ix]] = np.random.binomial(X_zero[i[ix], j[ix]].astype(np.int), rate) - return X_zero, i, j, ix - -parser = argparse.ArgumentParser(description='') -parser.add_argument('--data', type=str, default='data1',help='data1,2,3') -parser.add_argument('--datasetName', type=str, default='MMPbasal_2000',help='MMPbasal_2000') -parser.add_argument('--discreteTag', action='store_true', default=False, - help='whether input is raw or 0/1 (default: False)') -parser.add_argument('--ratio', type=str, default='0.1', - help='dropoutratio') -args = parser.parse_args() - - -# x = np.concatenate([np.random.uniform(-3, -2, (1000, 40)), np.random.uniform(2, 3, (1000, 40))], axis=0) -if args.discreteTag: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scData/{}/{}.features.D.csv'.format(args.datasetName,args.datasetName) -else: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/{}/{}_LTMG_0.1_features.npy'.format(args.data,args.datasetName) -x = np.load(filename,allow_pickle=True) -x = x.tolist() -x=x.todense() -x=np.asarray(x) -x=np.log(x+1) - -# Load single-cell RNA-seq data -# Default is KNN=5 -magic_operator = magic.MAGIC() -# magic_operator = magic.MAGIC(knn=10) -X_magic = magic_operator.fit_transform(x, genes="all_genes") -recon = X_magic - -discreteStr = '' -if args.discreteTag: - discreteStr = 'D' -datasetNameStr = args.datasetName+discreteStr - -np.save('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/magic/{}/{}_{}_recon.npy'.format(args.data,datasetNameStr,args.ratio),recon) - - -# From scVI -# # Load single-cell RNA-seq data -# scdata = magic.mg.SCData(x, "sc-seq") -# print(scdata) - -# scdata.run_magic(n_pca_components=20, random_pca=True, t=6, k=30, ka=10, epsilon=1, rescale_percent=99) - -# if len(sys.argv) == 2: -# np.save("t_MAGIC.npy", scdata.magic.data.as_matrix()) diff --git a/codesfromJGandYJ/impute code/SAVER_impute.py 
b/codesfromJGandYJ/impute code/SAVER_impute.py deleted file mode 100644 index 5d32405..0000000 --- a/codesfromJGandYJ/impute code/SAVER_impute.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import csv -import argparse -import sys -sys.path.append('../') -sys.path.append('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/') -from benchmark_util import impute_dropout - - -parser = argparse.ArgumentParser(description='') -parser.add_argument('--data', type=str, default='data1',help='data1,2,3') -parser.add_argument('--datasetName', type=str, default='MMPbasal_2000',help='MMPbasal_2000') -parser.add_argument('--discreteTag', action='store_true', default=False, - help='whether input is raw or 0/1 (default: False)') -parser.add_argument('--ratio', type=str, default='0.1', - help='dropoutratio') -parser.add_argument('--outfolder', type=str, default='/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/otherresults/saver/', - help='output filefolder') -args = parser.parse_args() - -# Ref: -# https://nbviewer.jupyter.org/github/YosefLab/scVI/blob/master/tests/notebooks/data_loading.ipynb - -if args.discreteTag: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scData/{}/{}.features.D.csv'.format(args.datasetName,args.datasetName) -else: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/{}/{}_LTMG_0.1_features.npy'.format(args.data,args.datasetName) -x = np.load(filename,allow_pickle=True) -x = x.tolist() -x=x.todense() -x=np.asarray(x) -x=np.log(x+1) -filenameFull = filename -save_path = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/saver/{}/'.format(args.data) - -discreteStr = '' -if args.discreteTag: - discreteStr = 'D' -datasetNameStr = args.datasetName+discreteStr - -features=x - - - -#write -dropout_filename = save_path+datasetNameStr+"_dropout.csv" -with open(dropout_filename, "w") as f: - writer = csv.writer(f) - writer.writerows(features) - - - - diff --git a/codesfromJGandYJ/impute code/SCIMPUTE.py b/codesfromJGandYJ/impute code/SCIMPUTE.py deleted file mode 100644 index 246239d..0000000 --- a/codesfromJGandYJ/impute code/SCIMPUTE.py +++ /dev/null @@ -1,56 +0,0 @@ -import os -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import csv -import argparse -import sys -sys.path.append('../') -sys.path.append('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/') -from benchmark_util import impute_dropout - - -parser = argparse.ArgumentParser(description='') -parser.add_argument('--data', type=str, default='data1',help='data1,2,3') -parser.add_argument('--datasetName', type=str, default='MMPbasal_2000',help='MMPbasal_2000') -parser.add_argument('--discreteTag', action='store_true', default=False, - help='whether input is raw or 0/1 (default: False)') -parser.add_argument('--ratio', type=str, default='0.1', - help='dropoutratio') -parser.add_argument('--outfolder', type=str, default='/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/otherresults/saver/', - help='output filefolder') -args = parser.parse_args() - -# Ref: -# https://nbviewer.jupyter.org/github/YosefLab/scVI/blob/master/tests/notebooks/data_loading.ipynb - -if args.discreteTag: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scData/{}/{}.features.D.csv'.format(args.datasetName,args.datasetName) -else: - filename = 
'/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/{}/{}_LTMG_0.1_features.npy'.format(args.data,args.datasetName) -x = np.load(filename,allow_pickle=True) -x = x.tolist() -x=x.todense() -x=np.asarray(x) -x=np.log(x+1) -filenameFull = filename -save_path = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scimpute/{}/'.format(args.data) - -discreteStr = '' -if args.discreteTag: - discreteStr = 'D' -datasetNameStr = args.datasetName+discreteStr - -features=x - - - -#write -dropout_filename = save_path+datasetNameStr+"_dropout.csv" -with open(dropout_filename, "w") as f: - writer = csv.writer(f) - writer.writerows(features) - - - - diff --git a/codesfromJGandYJ/impute code/dca_impute.py b/codesfromJGandYJ/impute code/dca_impute.py deleted file mode 100644 index 0496364..0000000 --- a/codesfromJGandYJ/impute code/dca_impute.py +++ /dev/null @@ -1,79 +0,0 @@ -#from dca.api import dca -#import anndata -#import matplotlib.pyplot as plt -#import numpy as np -#import time -#import pandas as pd - -#Ref: -# https://github.com/theislab/dca/blob/master/tutorial.ipynb -#z = pd.read_csv('/home/wangjue/biodata/scData/MMPbasal.csv') -#z = z.to_numpy() -#z = z[:,:-1] - -#selected = np.std(z, axis=0).argsort()[-2000:][::-1] -#expression_data = z[:, selected] - -#train = anndata.AnnData(expression_data) -#res = dca(train, verbose=True) -#train.X - -import os -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -import torch -import csv -import argparse -import sys -sys.path.append('../') -sys.path.append('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/') -from benchmark_util import impute_dropout - -parser = argparse.ArgumentParser(description='') -parser.add_argument('--data', type=str, default='data1',help='data1,2,3') -parser.add_argument('--datasetName', type=str, default='MMPbasal_2000',help='MMPbasal_2000') -parser.add_argument('--discreteTag', action='store_true', default=False, - help='whether input is raw or 0/1 (default: False)') -parser.add_argument('--ratio', type=str, default='0.1', - help='dropoutratio') -parser.add_argument('--outfolder', type=str, default='/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/otherresults/dca/', - help='output filefolder') -args = parser.parse_args() - - -if args.discreteTag: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scData/{}/{}.features.D.csv'.format(args.datasetName,args.datasetName) -else: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/{}/{}_LTMG_0.1_features.npy'.format(args.data,args.datasetName) -x = np.load(filename,allow_pickle=True) -x = x.tolist() -x=x.todense() -x=np.asarray(x) -filenameFull = filename -save_path = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/dca/{}/'.format(args.data) - -discreteStr = '' -if args.discreteTag: - discreteStr = 'D' -datasetNameStr = args.datasetName+discreteStr - - - -features=x.T - -#write -dropout_filename = save_path+datasetNameStr+"_dropout.csv" -with open(dropout_filename, "w") as f: - writer = csv.writer(f) - writer.writerows(features) - - - -os.system("dca "+dropout_filename+ " "+save_path+datasetNameStr) - -filename=save_path+datasetNameStr+"/mean.tsv" -imputed_values = pd.read_csv(filename,sep="\t") -imputed_values=imputed_values.T - -np.save('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/dca/{}/{}_{}_recon.npy'.format(args.data,datasetNameStr,args.ratio),imputed_values) \ No newline at end of file diff --git a/codesfromJGandYJ/impute 
code/deepimpute_impute.py b/codesfromJGandYJ/impute code/deepimpute_impute.py deleted file mode 100644 index 6c31962..0000000 --- a/codesfromJGandYJ/impute code/deepimpute_impute.py +++ /dev/null @@ -1,59 +0,0 @@ -import os -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from deepimpute.multinet import MultiNet -import torch -import csv -import argparse -import sys -sys.path.append('../') -sys.path.append('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/') -from benchmark_util import impute_dropout - - -parser = argparse.ArgumentParser(description='') -parser.add_argument('--data', type=str, default='data1',help='data1,2,3') -parser.add_argument('--datasetName', type=str, default='MMPbasal_2000',help='MMPbasal_2000') -parser.add_argument('--discreteTag', action='store_true', default=False, - help='whether input is raw or 0/1 (default: False)') -parser.add_argument('--ratio', type=str, default='0.1', - help='dropoutratio') -parser.add_argument('--outfolder', type=str, default='/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/otherresults/deepimpute/', - help='output filefolder') -args = parser.parse_args() - -# Ref: -# https://nbviewer.jupyter.org/github/YosefLab/scVI/blob/master/tests/notebooks/data_loading.ipynb - -if args.discreteTag: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scData/{}/{}.features.D.csv'.format(args.datasetName,args.datasetName) -else: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/{}/{}_LTMG_0.1_features.npy'.format(args.data,args.datasetName) -x = np.load(filename,allow_pickle=True) -x = x.tolist() -x=x.todense() -x=np.asarray(x) -#x=np.log(x+1) -filenameFull = filename -save_path = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/deepimpute_nolog/{}/'.format(args.data) - -discreteStr = '' -if args.discreteTag: - discreteStr = 'D' -datasetNameStr = args.datasetName+discreteStr - -features=x -dropout_filename = save_path+datasetNameStr+"_dropout.csv" -with open(dropout_filename, "w") as f: - writer = csv.writer(f) - writer.writerows(features) - -data = pd.read_csv(dropout_filename, header=None) -model = MultiNet() -model.fit(data) -imputed = model.predict(data) - - -np.save(save_path+'{}_{}_recon.npy'.format(datasetNameStr,args.ratio),imputed) - diff --git a/codesfromJGandYJ/impute code/saucie_impute_t.py b/codesfromJGandYJ/impute code/saucie_impute_t.py deleted file mode 100644 index 5831c63..0000000 --- a/codesfromJGandYJ/impute code/saucie_impute_t.py +++ /dev/null @@ -1,55 +0,0 @@ -import sys -import tensorflow as tf -sys.path.append('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/otherresults/SAUCIE-master/SAUCIE-master/') -from model import SAUCIE -from loader import Loader -import numpy as np -import matplotlib.pyplot as plt -import pandas as pd -import argparse -sys.path.append('../') -sys.path.append('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/') -from benchmark_util import impute_dropout - -parser = argparse.ArgumentParser(description='') -parser.add_argument('--data', type=str, default='data1',help='data1,2,3') -parser.add_argument('--datasetName', type=str, default='MMPbasal',help='MMPbasal_2000') -parser.add_argument('--discreteTag', action='store_true', default=False, - help='whether input is raw or 0/1 (default: False)') -parser.add_argument('--ratio', type=str, default='0.1', - help='dropoutratio') -args = parser.parse_args() - -# x = 
np.concatenate([np.random.uniform(-3, -2, (1000, 40)), np.random.uniform(2, 3, (1000, 40))], axis=0) -if args.discreteTag: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scData/{}/{}.features.D.csv'.format(args.datasetName,args.datasetName) -else: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/{}/{}_LTMG_0.1_features.npy'.format(args.data,args.datasetName) -x = np.load(filename,allow_pickle=True) -x = x.tolist() -x=x.todense() -x=np.asarray(x) -x=np.log(x+1) - -x=np.transpose(x) - -saucie = SAUCIE(x.shape[1]) -loadtrain = Loader(x, shuffle=True) -saucie.train(loadtrain, steps=1000) - -loadeval = Loader(x, shuffle=False) -reconstruction = saucie.get_reconstruction(loadeval) - -reconstruction=np.transpose(reconstruction) - -discreteStr = '' -if args.discreteTag: - discreteStr = 'D' -datasetNameStr = args.datasetName+discreteStr - -# l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax = imputation_error(recon, featuresOriginal, None, dropi, dropj, dropix) -# print('{:.4f} {:.4f} {:.4f} {:.4f} '.format(l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax), end='') - -np.save('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/saucie_t/{}/{}_{}_recon.npy'.format(args.data,datasetNameStr,args.ratio),reconstruction) - - diff --git a/codesfromJGandYJ/impute code/scVi_impute.py b/codesfromJGandYJ/impute code/scVi_impute.py deleted file mode 100644 index 6ce9383..0000000 --- a/codesfromJGandYJ/impute code/scVi_impute.py +++ /dev/null @@ -1,102 +0,0 @@ -import os -import numpy as np -import pandas as pd -import matplotlib.pyplot as plt -from scvi.dataset import CortexDataset, RetinaDataset, CsvDataset -from scvi.models import VAE -from scvi.inference import UnsupervisedTrainer -import torch -import csv -import argparse -import sys -sys.path.append('../') -sys.path.append('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scGNN-master/') -from benchmark_util import impute_dropout - - -parser = argparse.ArgumentParser(description='') -parser.add_argument('--data', type=str, default='data1',help='data1,2,3') -parser.add_argument('--datasetName', type=str, default='MMPbasal_2000',help='MMPbasal_2000') -parser.add_argument('--discreteTag', action='store_true', default=False, - help='whether input is raw or 0/1 (default: False)') -parser.add_argument('--ratio', type=str, default='0.1', - help='dropoutratio') -parser.add_argument('--outfolder', type=str, default='/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scvi/', - help='output filefolder') -args = parser.parse_args() - -# Ref: -# https://nbviewer.jupyter.org/github/YosefLab/scVI/blob/master/tests/notebooks/data_loading.ipynb - -if args.discreteTag: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scData/{}/{}.features.D.csv'.format(args.datasetName,args.datasetName) -else: - filename = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/{}/{}_LTMG_0.1_features.npy'.format(args.data,args.datasetName) -filenameFull = filename -save_path = '/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/scvi/{}/'.format(args.data) - -discreteStr = '' -if args.discreteTag: - discreteStr = 'D' -datasetNameStr = args.datasetName+discreteStr - -x = np.load(filename,allow_pickle=True) -x = x.tolist() -x=x.todense() -x=np.asarray(x) -x=np.log(x+1) - - -featuresOriginal = np.copy(x) -features, dropi, dropj, dropix = impute_dropout(featuresOriginal, rate=float(args.ratio)) - -#transpose and add names for rows and cols -features=np.transpose(features) 
-rowname=np.linspace(1,features.shape[0],features.shape[0]).reshape([features.shape[0],1]) -features=np.concatenate([rowname,features],axis=1) -colname=np.linspace(1,features.shape[1],features.shape[1]).reshape([1,features.shape[1]]) -features=np.concatenate([colname,features],axis=0) - -#write -dropout_filename = save_path+datasetNameStr+"_dropout.csv" -with open(dropout_filename, "w") as f: - writer = csv.writer(f) - writer.writerows(features) - -# gene_dataset = CortexDataset(save_path=save_path, total_genes=558) -gene_dataset = CsvDataset(dropout_filename, save_path=save_path+args.data+"/") - -n_epochs = 400 -lr = 1e-3 -use_batches = False -use_cuda = True - -vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches * use_batches) -trainer = UnsupervisedTrainer( - vae, - gene_dataset, - train_size=0.75, - use_cuda=use_cuda, - frequency=5, -) - -trainer.train(n_epochs=n_epochs, lr=lr) - - -full = trainer.create_posterior(trainer.model, gene_dataset, indices=np.arange(len(gene_dataset))) -latent, batch_indices, labels = full.sequential().get_latent() -batch_indices = batch_indices.ravel() - -# use imputation -imputed_values = full.sequential().imputation() -normalized_values = full.sequential().get_sample_scale() - -np.save(save_path+'{}_{}_recon.npy'.format(datasetNameStr,args.ratio),imputed_values) -np.save(save_path+'{}_{}_recon_normalized.npy'.format(datasetNameStr,args.ratio),normalized_values) -np.save(save_path+'{}_{}_featuresOriginal.npy'.format(datasetNameStr,args.ratio),featuresOriginal) -np.save(save_path+'{}_{}_dropi.npy'.format(datasetNameStr,args.ratio),dropi) -np.save(save_path+'{}_{}_dropj.npy'.format(datasetNameStr,args.ratio),dropj) -np.save(save_path+'{}_{}_dropix.npy'.format(datasetNameStr,args.ratio),dropix) - -# celltype: -#np.save(save_path+'{}_{}_z.npy'.format(datasetNameStr,args.ratio),latent) diff --git a/codesfromJGandYJ/impute/MAGIC_impute.py b/codesfromJGandYJ/impute/MAGIC_impute.py new file mode 100644 index 0000000..95fe325 --- /dev/null +++ b/codesfromJGandYJ/impute/MAGIC_impute.py @@ -0,0 +1,53 @@ +# Analysis using MAGIC method +import magic +import pandas as pd +import matplotlib.pyplot as plt +import numpy as np +import argparse +import sys +#from benchmark_util import impute_dropout + +parser = argparse.ArgumentParser(description='MAGIC Impute') +parser.add_argument('--origin', action='store_true', default=False, help='Whether use origin (default: use ratio 0.0)') +args = parser.parse_args() + + +def impute_Magic(seed=1, datasetName='9.Chung', ratio=0.1): + filename = '/storage/htc/joshilab/wangjue/scGNN/npyImputeG2E_{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(seed, datasetName, ratio) + x = np.load(filename,allow_pickle=True) + x = x.tolist() + x=x.todense() + x=np.asarray(x) + x=np.log(x+1) + + # Load single-cell RNA-seq data + # Default is KNN=5 + magic_operator = magic.MAGIC() + # magic_operator = magic.MAGIC(knn=10) + X_magic = magic_operator.fit_transform(x, genes="all_genes") + recon = X_magic + + np.save('/storage/htc/joshilab/wangjue/scGNN/magic/{}_{}_{}_recon.npy'.format(datasetName,ratio,seed),recon) + +datasetNameList = ['9.Chung','11.Kolodziejczyk','12.Klein','13.Zeisel'] +seedList = ['1','2','3'] +ratioList = [0.1, 0.3, 0.6, 0.8] + +if args.origin: + for datasetName in datasetNameList: + impute_Magic(seed='1', datasetName=datasetName, ratio='0.0') +else: + for datasetName in datasetNameList: + for seed in seedList: + for ratio in ratioList: + impute_Magic(seed=seed, datasetName=datasetName, ratio=ratio) + +# From scVI 
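# Note: the *_recon.npy matrices written by impute_Magic() above follow the '<dataset>_<ratio>_<seed>_recon.npy' naming
# that bak/npy2csv_script.py in this patch expects when it flattens the ratio-0.0 runs to CSV. A minimal sketch of that
# downstream step for one dataset (paths are illustrative; numpy and pandas as imported above):
#   t = np.load('magic/9.Chung_0.0_1_recon.npy')
#   pd.DataFrame(t).to_csv('magic_9.csv', header=None, index=False)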
+# # Load single-cell RNA-seq data +# scdata = magic.mg.SCData(x, "sc-seq") +# print(scdata) + +# scdata.run_magic(n_pca_components=20, random_pca=True, t=6, k=30, ka=10, epsilon=1, rescale_percent=99) + +# if len(sys.argv) == 2: +# np.save("t_MAGIC.npy", scdata.magic.data.as_matrix()) diff --git a/codesfromJGandYJ/impute/Run_netNMF_imputation.py b/codesfromJGandYJ/impute/Run_netNMF_imputation.py new file mode 100644 index 0000000..8c74b72 --- /dev/null +++ b/codesfromJGandYJ/impute/Run_netNMF_imputation.py @@ -0,0 +1,87 @@ +# This code has not cleaned yet +# run netNMF-sc from command line and save outputs to specified directory +from __future__ import print_function +import numpy as np +from warnings import warn +from joblib import Parallel, delayed +import copy,argparse,os,math,random,time +from scipy import sparse, io,linalg +from scipy.sparse import csr_matrix +import warnings,os +from netNMFsc import plot +warnings.simplefilter(action='ignore', category=FutureWarning) +import pandas as pd + +def main(args): + if args.method == 'GD': + from netNMFsc import netNMFGD + operator = netNMFGD(d=args.dimensions, alpha=args.alpha, n_inits=1, tol=args.tol, max_iter=args.max_iters, n_jobs=4) + elif args.method == 'MU': + from netNMFsc import netNMFMU + operator = netNMFMU(d=args.dimensions, alpha=args.alpha, n_inits=1, tol=args.tol, max_iter=args.max_iters, n_jobs=4) + + filename = '/storage/hpc/group/joshilab/scGNNdata/{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format( + args.Randomdata, args.datasetName,args.dropratio) + x = np.load(filename, allow_pickle=True) + x = x.tolist() + x = x.todense() + x = np.asarray(x) + if args.process == 'log': + x = np.log(x + 1) + + # transpose and add names for rows and cols + features = np.transpose(x) + + chung = pd.read_csv(args.filename, header=0, + index_col=0, sep=',') + X = features + genes = [] + for gen in chung.index.values: + if '.' 
in gen: + genes.append(gen.upper().split('.')[0]) + else: + genes.append(gen.upper()) + #print(genes) + operator.genes = np.asarray(genes) + operator.X = X + #operator.load_10X(direc=args.tenXdir,genome='mm10') + operator.load_network(net=args.network,genenames=args.netgenes,sparsity=args.sparsity) + dictW = operator.fit_transform() + W, H = dictW['W'], dictW['H'] + # k,clusters = plot.select_clusters(H,max_clusters=20) + # plot.tSNE(H,clusters,fname=args.direc+ '/netNMFsc_tsne_imputation_' +args.process +'_'+args.Randomdata) + # os.system('mkdir -p %s'%(args.direc)) + np.save(os.path.join(args.direc,args.Randomdata+'_'+args.process+'_imputation.npy'),np.dot(W,H)) + #np.save(os.path.join(args.direc,'H.npy'),H) + #np.save(os.path.join(args.direc, 'cluster.npy'), H) + return +#/storage/htc/joshilab/jghhd/singlecellTest/netNMFsc/netNMF-sc/netNMFsc/refdata/ + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("-m","--method",help="either 'GD for gradient descent or MU for multiplicative update",type=str,default='GD') + parser.add_argument("-f","--filename", help="path to data file (.npy or .mtx)",type=str,default='matrix.mtx') + parser.add_argument("-g","--gene_names", help="path to file containing gene names (.npy or .tsv)",type=str,default='gene_names.tsv') + parser.add_argument("-net","--network", help="path to network file (.npy or .mtx)",type=str,default='') + parser.add_argument("-netgenes","--netgenes", help="path to file containing gene names for network (.npy or .tsv)",type=str,default='') + parser.add_argument("-org","--organism", help="mouse or human",type=str,default='human') + parser.add_argument("-id","--idtype", help="ensemble, symbol, or entrez",type=str,default='ensemble') + parser.add_argument("-netid","--netidtype", help="ensemble, symbol, or entrez",type=str,default='entrez') + parser.add_argument("-n","--normalize", help="normalize data? 1 = yes, 0 = no",type=int,default=0) + parser.add_argument("-sparse","--sparsity", help="sparsity for network",type=float,default=0.99) + parser.add_argument("-mi","--max_iters", help="max iters for netNMF-sc",type=int,default=1500) + parser.add_argument("-t","--tol", help="tolerence for netNMF-sc",type=float,default=1e-2) + parser.add_argument("-d","--direc", help="directory to save files",default='') + parser.add_argument("-D","--dimensions", help="number of dimensions to apply shift",type=int,default = 10) + parser.add_argument("-a","--alpha", help="lambda param for netNMF-sc",type=float,default = 1.0) + parser.add_argument("-x","--tenXdir", help="data is from 10X. 
Only required to provide directory containing matrix.mtx, genes.tsv, barcodes.tsv files",type=str,default = '') + parser.add_argument('--Randomdata', type=str, default='npyImputeG2E_1', help='npyImputeG2E_1,2,3') + parser.add_argument('--datasetName', type=str, default='12.Klein', help='12.Klein,13.Zeisel') + parser.add_argument('--process', type=str, default='null', help='log/null to process data') + parser.add_argument("-Hasdot","--Hasdot",type = bool, help="data gene names has dot",default = True) + parser.add_argument('--dropratio', type=str, default='0.1', help='0.1,0.3,0.6,0.8') + args = parser.parse_args() + main(args) + + +#'/storage/htc/joshilab/jghhd/singlecellTest/Data/11.Kolodziejczyk/Use_expression.csv' diff --git a/codesfromJGandYJ/impute/SAUCIE_impute.py b/codesfromJGandYJ/impute/SAUCIE_impute.py new file mode 100644 index 0000000..a4b0b14 --- /dev/null +++ b/codesfromJGandYJ/impute/SAUCIE_impute.py @@ -0,0 +1,74 @@ +import sys +sys.path.append("/storage/htc/joshilab/wangjue/") +import SAUCIE +import tensorflow as tf +import numpy as np +import matplotlib.pyplot as plt +import pandas as pd +import argparse + +# modified from official tutorial: https://colab.research.google.com/github/KrishnaswamyLab/SingleCellWorkshop/blob/master/exercises/Deep_Learning/notebooks/02_Answers_Exploratory_analysis_of_single_cell_data_with_SAUCIE.ipynb +# Notes: Have to use very old tensorflow downloaded from conda: +# python==3.6.12 +# tensorflow==1.4.0 +# numpy==1.19.4 + +parser = argparse.ArgumentParser(description='Impute use SAUCIE') +parser.add_argument('--origin', action='store_true', default=False, help='Whether use origin (default: use ratio 0.0)') +args = parser.parse_args() + +def impute_saucie(seed=1, datasetName='9.Chung', ratio=0.1): + filename = '/storage/htc/joshilab/wangjue/scGNN/npyImputeG2E_{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(seed, datasetName, ratio) + x = np.load(filename,allow_pickle=True) + x = x.tolist() + x=x.todense() + x=np.asarray(x) + x=np.log(x+1) + x=np.transpose(x) + loader_train = SAUCIE.Loader(x, shuffle=True) + loader_eval = SAUCIE.Loader(x, shuffle=False) + # clear the computational graph + tf.reset_default_graph() + # build the SAUCIE model + model = SAUCIE.SAUCIE(x.shape[1]) + # train the model! 
+ model.train(loader_train, steps=2000) + #imputation + reconstruction = model.get_reconstruction(loader_eval) + reconstruction=np.transpose(reconstruction) + np.save('/storage/htc/joshilab/wangjue/scGNN/saucie/{}_{}_{}_recon.npy'.format(datasetName,ratio,seed),reconstruction) + +def plot_saucie(seed=1, datasetName='9.Chung', ratio=0.1): + filename = '/storage/htc/joshilab/wangjue/scGNN/npyImputeG2E_{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(seed, datasetName, ratio) + x = np.load(filename,allow_pickle=True) + x = x.tolist() + x=x.todense() + x=np.asarray(x) + x=np.log(x+1) + loader_eval = SAUCIE.Loader(x, shuffle=False) + # clear the computational graph + #plot + tf.reset_default_graph() + model = SAUCIE.SAUCIE(x.shape[1]) + model.train(loader_eval, steps=2000) + embedding = model.get_embedding(loader_eval) + num_clusters, clusters = model.get_clusters(loader_eval) + fig = plt.figure() + ax = fig.add_subplot(1, 1, 1) + ax.scatter(embedding[:, 0], embedding[:, 1], c=clusters) + fig.savefig('saucie_'+datasetName+'.png') + +datasetNameList = ['9.Chung','11.Kolodziejczyk','12.Klein','13.Zeisel'] +seedList = ['1','2','3'] +ratioList = [0.1, 0.3, 0.6, 0.8] + +if args.origin: + for datasetName in datasetNameList: + impute_saucie(seed='1', datasetName=datasetName, ratio='0.0') +else: + for datasetName in datasetNameList: + for seed in seedList: + for ratio in ratioList: + impute_saucie(seed=seed, datasetName=datasetName, ratio=ratio) + +# plot_saucie(seed='1', datasetName=datasetName, ratio='0.0') \ No newline at end of file diff --git a/codesfromJGandYJ/impute/SAVER_impute.py b/codesfromJGandYJ/impute/SAVER_impute.py new file mode 100644 index 0000000..f0f7381 --- /dev/null +++ b/codesfromJGandYJ/impute/SAVER_impute.py @@ -0,0 +1,56 @@ +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import csv +import argparse +import sys + +# Ref: +# https://github.com/mohuangx/SAVER +# https://mohuangx.github.io/SAVER/articles/saver-tutorial.html +# Use python to generate input for saver.r, then output + +parser = argparse.ArgumentParser(description='Impute SAVER') +parser.add_argument('--origin', action='store_true', default=False, help='Whether use origin (default: use ratio 0.0)') +args = parser.parse_args() + +save_path = '/storage/htc/joshilab/wangjue/scGNN/tmp/' + +def impute_saver(seed=1, datasetName='9.Chung', ratio=0.1): + filename = '/storage/htc/joshilab/wangjue/scGNN/npyImputeG2E_{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(seed, datasetName, ratio) + + x = np.load(filename,allow_pickle=True) + x = x.tolist() + x=x.todense() + x=np.asarray(x) + x=np.log(x+1) + features=x.T + + #write + dropout_filename = save_path+"saver_input.csv" + with open(dropout_filename, "w") as f: + writer = csv.writer(f) + writer.writerows(features) + + #run the R script + os.system("Rscript saver.r "+save_path+"saver_input.csv "+save_path+"saver_output.csv ") + + filename=save_path+"saver_output.csv" + imputed_values = pd.read_csv(filename,sep="\t",header=None) + imputed_values=imputed_values.T + + np.save('/storage/htc/joshilab/wangjue/scGNN/saver/{}_{}_{}_recon.npy'.format(datasetName,ratio,seed),imputed_values) + +datasetNameList = ['9.Chung','11.Kolodziejczyk','12.Klein','13.Zeisel'] +seedList = ['1','2','3'] +ratioList = [0.1, 0.3, 0.6, 0.8] + +if args.origin: + for datasetName in datasetNameList: + impute_saver(seed='1', datasetName=datasetName, ratio='0.0') +else: + for datasetName in datasetNameList: + for seed in seedList: + for ratio in ratioList: 
+ impute_saver(seed=seed, datasetName=datasetName, ratio=ratio) \ No newline at end of file diff --git a/codesfromJGandYJ/impute/SCIMPUTE_impute.py b/codesfromJGandYJ/impute/SCIMPUTE_impute.py new file mode 100644 index 0000000..879a6ca --- /dev/null +++ b/codesfromJGandYJ/impute/SCIMPUTE_impute.py @@ -0,0 +1,66 @@ +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import csv +import argparse +import sys + +# Notes in install scimpute: +# Have to add in R: +# Sys.setenv(R_REMOTES_NO_ERRORS_FROM_WARNINGS=TRUE) +# Ref: https://github.com/Vivianstats/scImpute + +parser = argparse.ArgumentParser(description='Impute scImpute') +parser.add_argument('--origin', action='store_true', default=False, help='Whether use origin (default: use ratio 0.0)') +args = parser.parse_args() + +save_path = '/storage/htc/joshilab/wangjue/scGNN/tmp/' + +def impute_scimpute(seed=1, datasetName='9.Chung', ratio=0.1): + filename = '/storage/htc/joshilab/wangjue/scGNN/npyImputeG2E_{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(seed, datasetName, ratio) + + x = np.load(filename,allow_pickle=True) + x = x.tolist() + x=x.todense() + x=np.asarray(x) + x=np.log(x+1) + + features = np.copy(x) + + #transpose and add names for rows and cols + features=np.transpose(features) + rowname=np.linspace(1,features.shape[0],features.shape[0]).reshape([features.shape[0],1]) + features=np.concatenate([rowname,features],axis=1) + colname=np.linspace(1,features.shape[1],features.shape[1]).reshape([1,features.shape[1]]) + features=np.concatenate([colname,features],axis=0) + + features=features.T + + #write + dropout_filename = save_path+"scimpute_input.csv" + with open(dropout_filename, "w") as f: + writer = csv.writer(f) + writer.writerows(features) + + #run the R script + os.system("Rscript scimpute.r "+save_path+"scimpute_input.csv "+save_path+"tmpscimpute/") + + filename=save_path+"tmpscimpute/scimpute_count.csv" + imputed_values = pd.read_csv(filename,sep=",",index_col=0) + imputed_values = imputed_values.to_numpy() + + np.save('/storage/htc/joshilab/wangjue/scGNN/scimpute/{}_{}_{}_recon.npy'.format(datasetName,ratio,seed),imputed_values) + +datasetNameList = ['9.Chung','11.Kolodziejczyk','12.Klein','13.Zeisel'] +seedList = ['1','2','3'] +ratioList = [0.1, 0.3, 0.6, 0.8] + +if args.origin: + for datasetName in datasetNameList: + impute_scimpute(seed='1', datasetName=datasetName, ratio='0.0') +else: + for datasetName in datasetNameList: + for seed in seedList: + for ratio in ratioList: + impute_scimpute(seed=seed, datasetName=datasetName, ratio=ratio) diff --git a/codesfromJGandYJ/impute/dca_impute.py b/codesfromJGandYJ/impute/dca_impute.py new file mode 100644 index 0000000..6b7b2a2 --- /dev/null +++ b/codesfromJGandYJ/impute/dca_impute.py @@ -0,0 +1,53 @@ +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import csv +import argparse +import sys + +# Ref: https://github.com/theislab/dca +# Notes: As tensorflow comes to 2.0 version, lots of things chagned, here is the version tested in Nov.26, 2020 +# python==3.7.9 +# tensorflow==1.15.4 +# keras==2.3.1 +# theano==1.0.5 +# scanpy==1.5.1 + +parser = argparse.ArgumentParser(description='Imputation DCA') +parser.add_argument('--origin', action='store_true', default=False, help='Whether use origin (default: use ratio 0.0)') +args = parser.parse_args() + +save_path = '/storage/htc/joshilab/wangjue/scGNN/tmp/' + +def impute_dca(seed=1, datasetName='9.Chung', ratio=0.1): + filename = 
'/storage/htc/joshilab/wangjue/scGNN/npyImputeG2E_{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(seed, datasetName, ratio) + x = np.load(filename,allow_pickle=True) + x = x.tolist() + x=x.todense() + x=np.asarray(x) + x=x.astype(int) + features=x.T + #write + dropout_filename = save_path+"dca_input.csv" + with open(dropout_filename, "w") as f: + writer = csv.writer(f) + writer.writerows(features) + os.system("dca "+dropout_filename+ " "+save_path+"tmpdca") + filename=save_path+"tmpdca/mean.tsv" + imputed_values = pd.read_csv(filename,sep="\t") + imputed_values=imputed_values.T + np.save('/storage/htc/joshilab/wangjue/scGNN/dca/{}_{}_{}_recon.npy'.format(datasetName,ratio,seed),imputed_values) + +datasetNameList = ['9.Chung','11.Kolodziejczyk','12.Klein','13.Zeisel'] +seedList = ['1','2','3'] +ratioList = [0.1, 0.3, 0.6, 0.8] + +if args.origin: + for datasetName in datasetNameList: + impute_dca(seed='1', datasetName=datasetName, ratio='0.0') +else: + for datasetName in datasetNameList: + for seed in seedList: + for ratio in ratioList: + impute_dca(seed=seed, datasetName=datasetName, ratio=ratio) diff --git a/codesfromJGandYJ/impute/deepimpute_impute.py b/codesfromJGandYJ/impute/deepimpute_impute.py new file mode 100644 index 0000000..9943321 --- /dev/null +++ b/codesfromJGandYJ/impute/deepimpute_impute.py @@ -0,0 +1,52 @@ +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from deepimpute.multinet import MultiNet +import torch +import csv +import argparse +import sys + +parser = argparse.ArgumentParser(description='Impute Deepimpute') +# In this script, not using arguments +parser.add_argument('--origin', action='store_true', default=False, help='Whether use origin (default: use ratio 0.0)') +args = parser.parse_args() + +# Ref: +# https://nbviewer.jupyter.org/github/YosefLab/scVI/blob/master/tests/notebooks/data_loading.ipynb +save_path = '/storage/htc/joshilab/wangjue/scGNN/tmp/' + +def impute_deepimpute(seed=1, datasetName='9.Chung', ratio=0.1): + filename = '/storage/htc/joshilab/wangjue/scGNN/npyImputeG2E_{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(seed, datasetName, ratio) + x = np.load(filename,allow_pickle=True) + x = x.tolist() + x=x.todense() + x=np.asarray(x) + # x=np.log(x+1) + + features=x + dropout_filename = save_path+"deepimpute.csv" + with open(dropout_filename, "w") as f: + writer = csv.writer(f) + writer.writerows(features) + + data = pd.read_csv(dropout_filename, header=None) + model = MultiNet() + model.fit(data) + imputed = model.predict(data) + + np.save('/storage/htc/joshilab/wangjue/scGNN/deepimpute/{}_{}_{}_recon.npy'.format(datasetName,ratio,seed),imputed) + +datasetNameList = ['9.Chung','11.Kolodziejczyk','12.Klein','13.Zeisel'] +seedList = ['1','2','3'] +ratioList = [0.1, 0.3, 0.6, 0.8] + +if args.origin: + for datasetName in datasetNameList: + impute_deepimpute(seed='1', datasetName=datasetName, ratio='0.0') +else: + for datasetName in datasetNameList: + for seed in seedList: + for ratio in ratioList: + impute_deepimpute(seed=seed, datasetName=datasetName, ratio=ratio) diff --git a/codesfromJGandYJ/impute/other_dca.sh b/codesfromJGandYJ/impute/other_dca.sh new file mode 100644 index 0000000..f41c874 --- /dev/null +++ b/codesfromJGandYJ/impute/other_dca.sh @@ -0,0 +1,17 @@ +#! 
/bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J DCA +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# + +module load miniconda3 +source activate /storage/htc/joshilab/wangjue/conda_R_dca +# grid +# python3 -W ignore dca_impute.py +python3 -W ignore dca_impute.py --origin diff --git a/codesfromJGandYJ/impute/other_deepimpute.sh b/codesfromJGandYJ/impute/other_deepimpute.sh new file mode 100644 index 0000000..23d18c9 --- /dev/null +++ b/codesfromJGandYJ/impute/other_deepimpute.sh @@ -0,0 +1,15 @@ +#! /bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J deepimpute +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R +# python3 -W ignore deepimpute_impute.py +python3 -W ignore deepimpute_impute.py --origin diff --git a/codesfromJGandYJ/impute/other_magic.sh b/codesfromJGandYJ/impute/other_magic.sh new file mode 100644 index 0000000..6d85905 --- /dev/null +++ b/codesfromJGandYJ/impute/other_magic.sh @@ -0,0 +1,15 @@ +#! /bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J Magic +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R +# python3 -W ignore MAGIC_impute.py +python3 -W ignore MAGIC_impute.py --origin diff --git a/codesfromJGandYJ/impute/other_saucie.sh b/codesfromJGandYJ/impute/other_saucie.sh new file mode 100644 index 0000000..31c8ce1 --- /dev/null +++ b/codesfromJGandYJ/impute/other_saucie.sh @@ -0,0 +1,15 @@ +#! /bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J saucie +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate /storage/htc/joshilab/wangjue/conda_R_saucie +# python3 -W ignore SAUCIE_impute.py +python3 -W ignore SAUCIE_impute.py --origin \ No newline at end of file diff --git a/codesfromJGandYJ/impute/other_saver.sh b/codesfromJGandYJ/impute/other_saver.sh new file mode 100644 index 0000000..2a29663 --- /dev/null +++ b/codesfromJGandYJ/impute/other_saver.sh @@ -0,0 +1,15 @@ +#! 
/bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p BioCompute,Lewis # use the BioCompute partition Lewis,BioCompute +#SBATCH -J Saver +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 12 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R +# python3 -W ignore SAVER_impute.py +python3 -W ignore SAVER_impute.py --origin diff --git a/codesfromJGandYJ/impute/other_scimpute.sh b/codesfromJGandYJ/impute/other_scimpute.sh new file mode 100644 index 0000000..5da0040 --- /dev/null +++ b/codesfromJGandYJ/impute/other_scimpute.sh @@ -0,0 +1,15 @@ +#! /bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p BioCompute,Lewis # use the BioCompute partition Lewis,BioCompute +#SBATCH -J scimpute +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 12 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R +# python3 -W ignore SCIMPUTE_impute.py +python3 -W ignore SCIMPUTE_impute.py --origin \ No newline at end of file diff --git a/codesfromJGandYJ/impute/other_scvi.sh b/codesfromJGandYJ/impute/other_scvi.sh new file mode 100644 index 0000000..7b258fa --- /dev/null +++ b/codesfromJGandYJ/impute/other_scvi.sh @@ -0,0 +1,15 @@ +#! /bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J scvi +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R +# python3 -W ignore scVi_impute.py +python3 -W ignore scVi_impute.py --origin \ No newline at end of file diff --git a/codesfromJGandYJ/impute/run_scIGANS_imputation.py b/codesfromJGandYJ/impute/run_scIGANS_imputation.py new file mode 100644 index 0000000..20faf7d --- /dev/null +++ b/codesfromJGandYJ/impute/run_scIGANS_imputation.py @@ -0,0 +1,51 @@ +# This code has not cleaned yet +import sys,os +import numpy as np +import pandas as pd +import argparse +sys.path.append('../') +sys.path.append('/storage/htc/joshilab/jghhd/singlecellTest/scIGAN/scIGANs/') + +parser = argparse.ArgumentParser(description='') +parser.add_argument('--Randomdata', type=str, default='npyImputeG2E_1',help='npyImputeG2E_1,2,3') +parser.add_argument('--datasetName', type=str, default='12.Klein',help='12.Klein,13.Zeisel') +parser.add_argument('--process', type=str, default='null',help='log/null to process data') +parser.add_argument('--exec', type=str, default='scIGANs',help='12.Klein') +parser.add_argument('--dropratio', type=str, default='0.1',help='0.1,0.3,0.6,0.8') +parser.add_argument('--csvsavepath', type=str, default='/storage/htc/joshilab/jghhd/singlecellTest/Data/',help='12.Klein') +parser.add_argument('--labelpath', type=str, default='/storage/htc/joshilab/jghhd/singlecellTest/Data/',help='12.Klein') +parser.add_argument('--outpath', type=str, 
default='/storage/htc/joshilab/jghhd/singlecellTest/scIGAN/Result_200/',help='12.Klein') +parser.add_argument('--Epotch', type=str, default='200',help='epotch') +args = parser.parse_args() + +# x = np.concatenate([np.random.uniform(-3, -2, (1000, 40)), np.random.uniform(2, 3, (1000, 40))], axis=0) + +filename = '/storage/hpc/group/joshilab/scGNNdata/{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(args.Randomdata,args.datasetName,args.dropratio) +x = np.load(filename,allow_pickle=True) +x = x.tolist() +x=x.todense() +x=np.asarray(x) +if args.process=='log': + x=np.log(x+1) + saveintedir = '{}{}/{}_{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features_log.csv'.format(args.csvsavepath, args.datasetName,args.Randomdata, + args.datasetName,args.dropratio) +elif args.process=='null': + saveintedir = '{}{}/{}_{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.csv'.format(args.csvsavepath, args.datasetName,args.Randomdata, + args.datasetName,args.dropratio) +#transpose and add names for rows and cols +features=np.transpose(x) + +pd.DataFrame(features).to_csv(saveintedir,sep='\t') + +label = '{}{}/{}_only_label.csv'.format(args.labelpath,args.datasetName,args.datasetName.split('.')[-1]) +#/storage/htc/joshilab/jghhd/singlecellTest/Data/12.Klein/Klein_only_label.csv + +cmd = '{} {} -l {} -e {} -o {}{}'.format(args.exec,saveintedir,label,args.Epotch,args.outpath,args.datasetName) +print(cmd) +os.system(cmd) +#scIGANs saveintedir -l -e 50 + +# l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax = imputation_error(recon, featuresOriginal, None, dropi, dropj, dropix) +# print('{:.4f} {:.4f} {:.4f} {:.4f} '.format(l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax), end='') + +#np.save('/storage/hpc/scratch/yjiang/SCwangjuexin/scGNN-master_021720/saucie_t/{}/{}_{}_recon.npy'.format(args.data,datasetNameStr,args.ratio),reconstruction) diff --git a/codesfromJGandYJ/impute/saver.r b/codesfromJGandYJ/impute/saver.r new file mode 100644 index 0000000..1b0953b --- /dev/null +++ b/codesfromJGandYJ/impute/saver.r @@ -0,0 +1,16 @@ +# Usage: +# Rscript saver.r input.txt output.txt +# test if there is one argument: if not, return an error +args = commandArgs(trailingOnly=TRUE) +if (length(args)==0) { + stop("At least one argument must be supplied (input file)\n", call.=FALSE) +} + +library(SAVER) +inputfile = args[1] +outputfile = args[2] +raw.data <- read.csv(inputfile, header = FALSE, sep=',') +expr <- as.matrix(raw.data) +# Use 12 cores in saver +expr.saver <- saver(expr, ncores = 12, estimates.only = TRUE) +write.table(expr.saver, file=outputfile, row.names = F, col.names = F, sep = "\t") \ No newline at end of file diff --git a/codesfromJGandYJ/impute/scVi_impute.py b/codesfromJGandYJ/impute/scVi_impute.py new file mode 100644 index 0000000..643204b --- /dev/null +++ b/codesfromJGandYJ/impute/scVi_impute.py @@ -0,0 +1,92 @@ +import os +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +from scvi.dataset import CortexDataset, RetinaDataset, CsvDataset +from scvi.models import VAE +from scvi.inference import UnsupervisedTrainer +import torch +import csv +import argparse +import sys + +# pip install scvi==0.6.3 +parser = argparse.ArgumentParser(description='scVi imputation') +parser.add_argument('--origin', action='store_true', default=False, help='Whether use origin (default: use ratio 0.0)') +args = parser.parse_args() + +# Ref: +# https://nbviewer.jupyter.org/github/YosefLab/scVI/blob/master/tests/notebooks/data_loading.ipynb + + +save_path = '/storage/htc/joshilab/wangjue/scGNN/tmp/' + +def 
impute_scvi(seed=1, datasetName='9.Chung', ratio=0.1): + filename = '/storage/htc/joshilab/wangjue/scGNN/npyImputeG2E_{}/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(seed, datasetName, ratio) + + x = np.load(filename,allow_pickle=True) + x = x.tolist() + x=x.todense() + x=np.asarray(x) + x=np.log(x+1) + + features = np.copy(x) + + #transpose and add names for rows and cols + features=np.transpose(features) + rowname=np.linspace(1,features.shape[0],features.shape[0]).reshape([features.shape[0],1]) + features=np.concatenate([rowname,features],axis=1) + colname=np.linspace(1,features.shape[1],features.shape[1]).reshape([1,features.shape[1]]) + features=np.concatenate([colname,features],axis=0) + + #write + dropout_filename = save_path+"scvi.csv" + with open(dropout_filename, "w") as f: + writer = csv.writer(f) + writer.writerows(features) + + # gene_dataset = CortexDataset(save_path=save_path, total_genes=558) + gene_dataset = CsvDataset(dropout_filename, save_path=save_path) + + n_epochs = 400 + lr = 1e-3 + use_batches = False + use_cuda = False + + vae = VAE(gene_dataset.nb_genes, n_batch=gene_dataset.n_batches * use_batches) + trainer = UnsupervisedTrainer( + vae, + gene_dataset, + train_size=0.75, + use_cuda=use_cuda, + frequency=5, + ) + + trainer.train(n_epochs=n_epochs, lr=lr) + + full = trainer.create_posterior(trainer.model, gene_dataset, indices=np.arange(len(gene_dataset))) + latent, batch_indices, labels = full.sequential().get_latent() + batch_indices = batch_indices.ravel() + + # use imputation + imputed_values = full.sequential().imputation() + normalized_values = full.sequential().get_sample_scale() + + np.save('/storage/htc/joshilab/wangjue/scGNN/scvi/{}_{}_{}_recon.npy'.format(datasetName,ratio,seed),imputed_values) + np.save('/storage/htc/joshilab/wangjue/scGNN/scvi/{}_{}_{}_recon_normalized.npy'.format(datasetName,ratio,seed),normalized_values) + +datasetNameList = ['9.Chung','11.Kolodziejczyk','12.Klein','13.Zeisel'] +seedList = ['1','2','3'] +ratioList = [0.1, 0.3, 0.6, 0.8] + +if args.origin: + for datasetName in datasetNameList: + impute_scvi(seed='1', datasetName=datasetName, ratio='0.0') +else: + for datasetName in datasetNameList: + for seed in seedList: + for ratio in ratioList: + impute_scvi(seed=seed, datasetName=datasetName, ratio=ratio) + +# celltype: +#np.save(save_path+'{}_{}_z.npy'.format(datasetNameStr,args.ratio),latent) diff --git a/codesfromJGandYJ/impute/scimpute.r b/codesfromJGandYJ/impute/scimpute.r new file mode 100644 index 0000000..ec91006 --- /dev/null +++ b/codesfromJGandYJ/impute/scimpute.r @@ -0,0 +1,20 @@ +# Usage: +# Rscript scImpute.r input.txt outputdir +# test if there is one argument: if not, return an error +args = commandArgs(trailingOnly=TRUE) +if (length(args)==0) { + stop("At least one argument must be supplied (input file)\n", call.=FALSE) +} + +library(scImpute) +inputfile = args[1] +outputDir = args[2] +scimpute(# full path to raw count matrix + count_path = inputfile, + infile = "csv", # format of input file + outfile = "csv", # format of output file + out_dir = outputDir, # full path to output directory + labeled = FALSE, # cell type labels not available + drop_thre = 0.5, # threshold set on dropout probability + Kcluster = 2, # 2 cell subpopulations + ncores = 12) # number of cores used in parallel computation diff --git a/do_timer_test.sh b/do_timer_test.sh new file mode 100644 index 0000000..43c06e8 --- /dev/null +++ b/do_timer_test.sh @@ -0,0 +1,9 @@ +python3 -W ignore main_benchmark_timer.py --datasetName 9.Chung 
--benchmark /home/wangjue/myprojects/scGNN/data/scData/9.Chung/Chung_cell_label.csv --LTMGDir /home/wangjue/myprojects/scGNN/data/scData/ --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --npyDir outputDir_gpu/ --debuginfo >9gpu.txt +python3 -W ignore main_benchmark_timer.py --datasetName 11.Kolodziejczyk --benchmark /home/wangjue/myprojects/scGNN/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv --LTMGDir /home/wangjue/myprojects/scGNN/data/scData/ --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --npyDir outputDir_gpu/ --debuginfo >11gpu.txt +python3 -W ignore main_benchmark_timer.py --datasetName 12.Klein --benchmark /home/wangjue/myprojects/scGNN/data/scData/12.Klein/Klein_cell_label.csv --LTMGDir /home/wangjue/myprojects/scGNN/data/scData/ --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --npyDir outputDir_gpu/ --debuginfo >12gpu.txt +python3 -W ignore main_benchmark_timer.py --datasetName 13.Zeisel --benchmark /home/wangjue/myprojects/scGNN/data/scData/13.Zeisel/Zeisel_cell_label.csv --LTMGDir /home/wangjue/myprojects/scGNN/data/scData/ --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --npyDir outputDir_gpu/ --debuginfo >13gpu.txt + +python3 -W ignore main_benchmark_timer.py --datasetName 9.Chung --benchmark /home/wangjue/myprojects/scGNN/data/scData/9.Chung/Chung_cell_label.csv --LTMGDir /home/wangjue/myprojects/scGNN/data/scData/ --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --npyDir outputDir_cpu/ --no-cuda --debuginfo >9cpu.txt +python3 -W ignore main_benchmark_timer.py --datasetName 11.Kolodziejczyk --benchmark /home/wangjue/myprojects/scGNN/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv --LTMGDir /home/wangjue/myprojects/scGNN/data/scData/ --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --npyDir outputDir_cpu/ --debuginfo --no-cuda >11cpu.txt +python3 -W ignore main_benchmark_timer.py --datasetName 12.Klein --benchmark /home/wangjue/myprojects/scGNN/data/scData/12.Klein/Klein_cell_label.csv --LTMGDir /home/wangjue/myprojects/scGNN/data/scData/ --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --npyDir outputDir_cpu/ --debuginfo --no-cuda >12cpu.txt +python3 -W ignore main_benchmark_timer.py --datasetName 13.Zeisel --benchmark /home/wangjue/myprojects/scGNN/data/scData/13.Zeisel/Zeisel_cell_label.csv --LTMGDir /home/wangjue/myprojects/scGNN/data/scData/ --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --npyDir outputDir_cpu/ --debuginfo --no-cuda >13cpu.txt diff --git a/generating_Impute_0.0.py b/generating_Impute_0.0.py new file mode 100644 index 0000000..dd50d28 --- /dev/null +++ b/generating_Impute_0.0.py @@ -0,0 +1,78 @@ +import argparse + +# python generatingMethodsBatchshell_louvain.py +# python generatingMethodsBatchshell_louvain.py --imputeMode +parser = argparse.ArgumentParser(description='Generating sbatch files for HPC cluster running imputation of original scGNN ') +parser.add_argument('--outputDir', type=str, default='', + help='Directory of batch files for cluster running') +parser.add_argument('--imputeMode', action='store_true', default=True, + help='whether impute') +args = parser.parse_args() + +templateStr1 = "#! 
/bin/bash\n"\ +"######################### Batch Headers #########################\n"\ +"#SBATCH -A xulab\n"\ +"#SBATCH -p BioCompute,Lewis # use the BioCompute partition Lewis,BioCompute\n"\ +"#SBATCH -J " + +templateStr2 = "\n#SBATCH -o results-%j.out # give the job output a custom name\n"\ +"#SBATCH -t 2-00:00 # two days time limit\n"\ +"#SBATCH -N 1 # number of nodes\n"\ +"#SBATCH -n 1 # number of cores (AKA tasks)\n"\ +"#SBATCH --mem=128G\n"\ +"#################################################################\n"\ +"module load miniconda3\n"\ +"source activate conda_R\n" + +#tuple list +#batchInfo,scGNNparam,outDir +#huge matrix +methodsList = [ + ('run_experiment_2_g_e_1 2ge1','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --seed 1 --npyDir','npyG2E_1/'), +] + +dropoutList = ['0.0',] + +# generate sbatch files: +for item in methodsList: + batchInfo,scGNNparam,outDirStr = item + tmp = batchInfo.split() + tmpstr1=tmp[0] + tmpstr2=tmp[1] + imputeStr = '' + if args.imputeMode: + tmpstr1 = tmpstr1.replace('run_experiment','run_experimentImpute') + tmpstr2 = "I"+tmpstr2 + # tmpstr2 = "I"+tmpstr2[2:] + imputeStr = ' --imputeMode ' + outDirStr = "npyImpute"+outDirStr[3:] + outputFilename = args.outputDir + tmpstr1 + abbrStr = tmpstr2 + + for dropoutPara in dropoutList: + commandLine = "python3 -W ignore main_benchmark.py --datasetName 9.Chung --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv "+scGNNparam+" "+outDirStr+imputeStr+" --dropoutRatio "+dropoutPara+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_9_"+dropoutPara+".sh",'w') as fw: + fw.write(outStr) + fw.close() + + for dropoutPara in dropoutList: + commandLine = "python3 -W ignore main_benchmark.py --datasetName 11.Kolodziejczyk --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv "+scGNNparam+" "+outDirStr+imputeStr+" --dropoutRatio "+dropoutPara+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_11_"+dropoutPara+".sh",'w') as fw: + fw.write(outStr) + fw.close() + + for dropoutPara in dropoutList: + commandLine = "python3 -W ignore main_benchmark.py --datasetName 12.Klein --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv "+scGNNparam+" "+outDirStr+imputeStr+" --dropoutRatio "+dropoutPara+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_12_"+dropoutPara+".sh",'w') as fw: + fw.write(outStr) + fw.close() + + for dropoutPara in dropoutList: + commandLine = "python3 -W ignore main_benchmark.py --datasetName 13.Zeisel --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv "+scGNNparam+" "+outDirStr+imputeStr+" --dropoutRatio "+dropoutPara+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_13_"+dropoutPara+".sh",'w') as fw: + fw.write(outStr) + fw.close() diff --git a/generating_Impute_0.1-0.8-ablation.py b/generating_Impute_0.1-0.8-ablation.py new file mode 100644 index 0000000..ce1d245 --- /dev/null +++ b/generating_Impute_0.1-0.8-ablation.py @@ -0,0 +1,94 @@ +import argparse + +# python generatingMethodsBatchshell_louvain.py +# python generatingMethodsBatchshell_louvain.py --imputeMode +parser = argparse.ArgumentParser(description='Generating sbatch files for HPC cluster running') +parser.add_argument('--outputDir', type=str, default='', + help='Directory of batch files for cluster running') 
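+# Note (added for clarity): '--imputeMode' below is declared with action='store_true' but default=True,
+# so it is effectively always on for this generator whether or not the flag is passed.
+# With the methodsList/dropoutList defined further down, this script writes one sbatch file per
+# (methodsList entry, dataset, dropout ratio), i.e. 15 x 4 x 4 = 240 files, named like
+# run_experimentImpute_2_g_e_1_9_0.1.sh.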
+parser.add_argument('--imputeMode', action='store_true', default=True, + help='whether impute') +args = parser.parse_args() + +templateStr1 = "#! /bin/bash\n"\ +"######################### Batch Headers #########################\n"\ +"#SBATCH -A xulab\n"\ +"#SBATCH -p BioCompute,Lewis # use the BioCompute partition Lewis,BioCompute\n"\ +"#SBATCH -J " + +templateStr2 = "\n#SBATCH -o results-%j.out # give the job output a custom name\n"\ +"#SBATCH -t 2-00:00 # two days time limit\n"\ +"#SBATCH -N 1 # number of nodes\n"\ +"#SBATCH -n 1 # number of cores (AKA tasks)\n"\ +"#SBATCH --mem=128G\n"\ +"#################################################################\n"\ +"module load miniconda3\n"\ +"source activate conda_R\n" + +#tuple list +#batchInfo,scGNNparam,outDir +#huge matrix +methodsList = [ + ('run_experiment_2_g_e_L_1 2geL1','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --L1Para 0.0 --seed 1 --npyDir','npyG2EL_1/'), + ('run_experiment_1_g_e_1 1ge1','--regulized-type LTMG --EMtype EM --clustering-method LouvainK --useGAEembedding --seed 1 --npyDir','npyG1E_1/'), + ('run_experiment_2_g_f_1 2gf1','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --seed 1 --npyDir','npyG2F_1/'), + ('run_experiment_2_n_e_1 2ne1','--regulized-type noregu --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --seed 1 --npyDir','npyN2E_1/'), + ('run_experiment_2_g_e_1 2ge1','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --seed 1 --npyDir','npyG2E_1/'), + + ('run_experiment_2_g_e_L_2 2geL2','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --L1Para 0.0 --seed 2 --npyDir','npyG2EL_2/'), + ('run_experiment_1_g_e_2 1ge2','--regulized-type LTMG --EMtype EM --clustering-method LouvainK --useGAEembedding --seed 2 --npyDir','npyG1E_2/'), + ('run_experiment_2_g_f_2 2gf2','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --seed 2 --npyDir','npyG2F_2/'), + ('run_experiment_2_n_e_2 2ne2','--regulized-type noregu --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --seed 2 --npyDir','npyN2E_2/'), + ('run_experiment_2_g_e_2 2ge2','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --seed 2 --npyDir','npyG2E_2/'), + + ('run_experiment_2_g_e_L_3 2geL3','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --L1Para 0.0 --seed 3 --npyDir','npyG2EL_3/'), + ('run_experiment_1_g_e_3 1ge3','--regulized-type LTMG --EMtype EM --clustering-method LouvainK --useGAEembedding --seed 3 --npyDir','npyG1E_3/'), + ('run_experiment_2_g_f_3 2gf3','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --seed 3 --npyDir','npyG2F_3/'), + ('run_experiment_2_n_e_3 2ne3','--regulized-type noregu --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --seed 3 --npyDir','npyN2E_3/'), + ('run_experiment_2_g_e_3 2ge3','--regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --seed 3 --npyDir','npyG2E_3/'), +] + +dropoutList = ['0.1','0.3','0.6','0.8'] + +# generate sbatch files: +for item in methodsList: + batchInfo,scGNNparam,outDirStr = item + tmp = batchInfo.split() + tmpstr1=tmp[0] + tmpstr2=tmp[1] + imputeStr = '' + if args.imputeMode: + tmpstr1 = tmpstr1.replace('run_experiment','run_experimentImpute') + tmpstr2 = "I"+tmpstr2 + # tmpstr2 = "I"+tmpstr2[2:] + imputeStr = ' --imputeMode ' + outDirStr = 
"npyImpute"+outDirStr[3:] + outputFilename = args.outputDir + tmpstr1 + abbrStr = tmpstr2 + + for dropoutPara in dropoutList: + commandLine = "python3 -W ignore main_benchmark.py --datasetName 9.Chung --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv "+scGNNparam+" "+outDirStr+imputeStr+" --dropoutRatio "+dropoutPara+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_9_"+dropoutPara+".sh",'w') as fw: + fw.write(outStr) + fw.close() + + for dropoutPara in dropoutList: + commandLine = "python3 -W ignore main_benchmark.py --datasetName 11.Kolodziejczyk --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv "+scGNNparam+" "+outDirStr+imputeStr+" --dropoutRatio "+dropoutPara+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_11_"+dropoutPara+".sh",'w') as fw: + fw.write(outStr) + fw.close() + + for dropoutPara in dropoutList: + commandLine = "python3 -W ignore main_benchmark.py --datasetName 12.Klein --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv "+scGNNparam+" "+outDirStr+imputeStr+" --dropoutRatio "+dropoutPara+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_12_"+dropoutPara+".sh",'w') as fw: + fw.write(outStr) + fw.close() + + for dropoutPara in dropoutList: + commandLine = "python3 -W ignore main_benchmark.py --datasetName 13.Zeisel --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv "+scGNNparam+" "+outDirStr+imputeStr+" --dropoutRatio "+dropoutPara+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_13_"+dropoutPara+".sh",'w') as fw: + fw.write(outStr) + fw.close() diff --git a/generating_distribution.py b/generating_distribution.py new file mode 100644 index 0000000..a69efbb --- /dev/null +++ b/generating_distribution.py @@ -0,0 +1,103 @@ +import argparse + +# python generatingMethodsBatchshell_louvain.py +# python generatingMethodsBatchshell_louvain.py --imputeMode +parser = argparse.ArgumentParser(description='Generating sbatch files for HPC cluster running') +parser.add_argument('--outputDir', type=str, default='', + help='Directory of batch files for cluster running') +args = parser.parse_args() + +templateStr1 = "#! 
/bin/bash\n"\ +"######################### Batch Headers #########################\n"\ +"#SBATCH -A xulab\n"\ +"#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute\n"\ +"#SBATCH -J " + +templateStr2 = "\n#SBATCH -o results-%j.out # give the job output a custom name\n"\ +"#SBATCH -t 2-00:00 # two days time limit\n"\ +"#SBATCH -N 1 # number of nodes\n"\ +"#SBATCH -n 1 # number of cores (AKA tasks)\n"\ +"#SBATCH --mem=128G\n"\ +"#################################################################\n"\ +"module load miniconda3\n"\ +"source activate conda_R\n" + +#tuple list +#batchInfo,scGNNparam,outDir +#huge matrix +methodsList = [ + ('plot_G2E_0.1 G2E1','LTMG_0.1_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2E'), + ('plot_G2E_0.3 G2E3','LTMG_0.3_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2E'), + ('plot_G2E_0.6 G2E6','LTMG_0.6_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2E'), + ('plot_G2E_0.8 G2E8','LTMG_0.8_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2E'), + + ('plot_G2EL_0.1 G2E1','LTMG_0.1_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2E'), + ('plot_G2EL_0.3 G2E3','LTMG_0.3_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2E'), + ('plot_G2EL_0.6 G2E6','LTMG_0.6_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2E'), + ('plot_G2EL_0.8 G2E8','LTMG_0.8_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2E'), + + ('plot_G1E_0.1 G1E1','LTMG_0.1_10-0.1-0.9-0.0-0.3-0.1','npyImputeG1E'), + ('plot_G1E_0.3 G1E3','LTMG_0.3_10-0.1-0.9-0.0-0.3-0.1','npyImputeG1E'), + ('plot_G1E_0.6 G1E6','LTMG_0.6_10-0.1-0.9-0.0-0.3-0.1','npyImputeG1E'), + ('plot_G1E_0.8 G1E8','LTMG_0.8_10-0.1-0.9-0.0-0.3-0.1','npyImputeG1E'), + + ('plot_G2F_0.1 G2F1','LTMG_0.1_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2F'), + ('plot_G2F_0.3 G2F3','LTMG_0.3_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2F'), + ('plot_G2F_0.6 G2F6','LTMG_0.6_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2F'), + ('plot_G2F_0.8 G2F8','LTMG_0.8_10-0.1-0.9-0.0-0.3-0.1','npyImputeG2F'), + + ('plot_N2E_0.1 N2E1','noregu_0.1_10-0.1-0.9-0.0-0.3-0.1','npyImputeN2E'), + ('plot_N2E_0.3 N2E3','noregu_0.3_10-0.1-0.9-0.0-0.3-0.1','npyImputeN2E'), + ('plot_N2E_0.6 N2E6','noregu_0.6_10-0.1-0.9-0.0-0.3-0.1','npyImputeN2E'), + ('plot_N2E_0.8 N2E8','noregu_0.8_10-0.1-0.9-0.0-0.3-0.1','npyImputeN2E'), + +] + +seedList = ['_1/','_2/','_3/'] + +# generate sbatch files: +for item in methodsList: + batchInfo,param,dirStr = item + tmp = batchInfo.split() + tmpstr1=tmp[0] + tmpstr2=tmp[1] + imputeStr = '' + outputFilename = args.outputDir + tmpstr1 + abbrStr = tmpstr2 + + commandLine = '' + for seed in seedList: + commandLine += "python3 -W ignore plot_distribution.py --datasetName 9.Chung --para "+param+" --inDir "+dirStr+seed+" --outDir "+dirStr+seed+"\n" + commandLine += "Rscript plot_distribution.r 9.Chung "+param+" "+dirStr+seed+" "+dirStr+seed+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_9.sh",'w') as fw: + fw.write(outStr) + fw.close() + + commandLine = '' + for seed in seedList: + commandLine += "python3 -W ignore plot_distribution.py --datasetName 11.Kolodziejczyk --para "+param+" --inDir "+dirStr+seed+" --outDir "+dirStr+seed+"\n" + commandLine += "Rscript plot_distribution.r 11.Kolodziejczyk "+param+" "+dirStr+seed+" "+dirStr+seed+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_11.sh",'w') as fw: + fw.write(outStr) + fw.close() + + commandLine = '' + for seed in seedList: + commandLine += "python3 -W ignore plot_distribution.py --datasetName 12.Klein --para "+param+" --inDir "+dirStr+seed+" --outDir "+dirStr+seed+"\n" + commandLine += "Rscript 
plot_distribution.r 12.Klein "+param+" "+dirStr+seed+" "+dirStr+seed+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_12.sh",'w') as fw: + fw.write(outStr) + fw.close() + + commandLine = '' + for seed in seedList: + commandLine += "python3 -W ignore plot_distribution.py --datasetName 13.Zeisel --para "+param+" --inDir "+dirStr+seed+" --outDir "+dirStr+seed+"\n" + commandLine += "Rscript plot_distribution.r 13.Zeisel "+param+" "+dirStr+seed+" "+dirStr+seed+"\n" + outStr = templateStr1 + abbrStr + templateStr2 + commandLine + "\n" + with open(outputFilename+"_13.sh",'w') as fw: + fw.write(outStr) + fw.close() + diff --git a/graph_function.py b/graph_function.py index f1c65b2..9e30b9a 100644 --- a/graph_function.py +++ b/graph_function.py @@ -71,6 +71,12 @@ def generateAdj(featureMatrix, graphType='KNNgraph', para = None, parallelLimit distanceType = parawords[0] k = int(parawords[1]) edgeList = calculateKNNgraphDistanceMatrixStatsSingleThread(featureMatrix, distanceType=distanceType, k=k) + elif graphType == 'KNNgraphStatsSingleThreadNoPrune': + if para != None: + parawords = para.split(':') + distanceType = parawords[0] + k = int(parawords[1]) + edgeList = calculateKNNgraphDistanceMatrixStatsSingleThreadNoPrune(featureMatrix, distanceType=distanceType, k=k) else: print('Should give graphtype') @@ -330,6 +336,25 @@ def calculateKNNgraphDistanceMatrixStatsSingleThread(featureMatrix, distanceType return edgeList +#para: measuareName:k:threshold no prune only +def calculateKNNgraphDistanceMatrixStatsSingleThreadNoPrune(featureMatrix, distanceType='euclidean', k=10, param=None): + r""" + Thresholdgraph: KNN Graph with stats one-std based methods, SingleThread version, no boundary, + """ + + edgeList=[] + for i in np.arange(featureMatrix.shape[0]): + tmp=featureMatrix[i,:].reshape(1,-1) + distMat = distance.cdist(tmp,featureMatrix, distanceType) + res = distMat.argsort()[:k+1] + for j in np.arange(1,k+1): + # TODO: check, only exclude large outliners + # if (distMat[0,res[0][j]]<=mean+std) and (distMat[0,res[0][j]]>=mean-std): + weight = 1.0 + edgeList.append((i,res[0][j],weight)) + + return edgeList + # kernelDistance def kernelDistance(distance,delta=1.0): ''' diff --git a/main_benchmark.py b/main_benchmark.py index 101de9f..0a295fc 100644 --- a/main_benchmark.py +++ b/main_benchmark.py @@ -23,8 +23,7 @@ # Benchmark for both celltype identification and imputation, needs Preprocessing_main.py first, then proceed by this script. 
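# For reference, a typical invocation of this benchmark script, as emitted by the generating_Impute_* helpers added in this change (imputation mode, 10% dropout, seed 1):
#   python3 -W ignore main_benchmark.py --datasetName 12.Klein --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv --regulized-type LTMG --EMtype celltypeEM --clustering-method LouvainK --useGAEembedding --seed 1 --npyDir npyImputeG2E_1/ --imputeMode --dropoutRatio 0.1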
parser = argparse.ArgumentParser(description='Graph EM AutoEncoder for scRNA') parser.add_argument('--datasetName', type=str, default='1.Biase', - help='TGFb/sci-CAR/sci-CAR_LTMG/MMPbasal/MMPbasal_all/MMPbasal_allgene/MMPbasal_allcell/MMPepo/MMPbasal_LTMG/MMPbasal_all_LTMG/MMPbasal_2000') -# Dataset: 1-13 benchmark: 1.Biase/2.Li/3.Treutlein/4.Yan/5.Goolam/6.Guo/7.Deng/8.Pollen/9.Chung/10.Usoskin/11.Kolodziejczyk/12.Klein/13.Zeisel + help='Dataset: 1-13 benchmark: 1.Biase/2.Li/3.Treutlein/4.Yan/5.Goolam/6.Guo/7.Deng/8.Pollen/9.Chung/10.Usoskin/11.Kolodziejczyk/12.Klein/13.Zeisel') parser.add_argument('--batch-size', type=int, default=12800, metavar='N', help='input batch size for training (default: 12800)') parser.add_argument('--epochs', type=int, default=500, metavar='N', @@ -37,7 +36,7 @@ help='EM process type (default: celltypeEM) or EM') parser.add_argument('--alpha', type=float, default=0.5, help='iteration alpha (default: 0.5) to control the converge rate, should be a number between 0~1') -parser.add_argument('--converge-type', type=str, default='either', +parser.add_argument('--converge-type', type=str, default='celltype', help='type of converge: celltype/graph/both/either (default: celltype) ') parser.add_argument('--converge-graphratio', type=float, default=0.01, help='ratio of cell type change in EM iteration (default: 0.01), 0-1') @@ -588,22 +587,34 @@ def train(epoch, train_loader=train_loader, EMFlag=False, taskType='celltype'): # graph criteria if args.converge_type == 'graph': if graphChange < graphChangeThreshold: - print('Converge now!') + print('Graph Converge now!') + # Converge, Update + adjOld = adjNew + listResultOld = listResult break # celltype criteria elif args.converge_type == 'celltype': if ari>args.converge_celltyperatio: - print('Converge now!') + print('Celltype Converge now!') + # Converge, Update + adjOld = adjNew + listResultOld = listResult break # if both criteria are meets elif args.converge_type == 'both': if graphChange < graphChangeThreshold and ari > args.converge_celltyperatio: - print('Converge now!') + print('Graph and Celltype Converge now!') + # Converge, Update + adjOld = adjNew + listResultOld = listResult break # if either criteria are meets elif args.converge_type == 'either': if graphChange < graphChangeThreshold or ari > args.converge_celltyperatio: - print('Converge now!') + print('Graph or Celltype Converge now!') + # Converge, Update + adjOld = adjNew + listResultOld = listResult break # Update diff --git a/main_benchmark_timer.py b/main_benchmark_timer.py new file mode 100644 index 0000000..77d1a9d --- /dev/null +++ b/main_benchmark_timer.py @@ -0,0 +1,745 @@ +import time +import resource +import datetime +import argparse +import sys +import numpy as np +import pickle as pkl +import networkx as nx +import scipy.sparse as sp +import torch +from torch.utils.data import Dataset, DataLoader +from torch import nn, optim +from torch.nn import functional as F +from sklearn.decomposition import PCA +from sklearn.metrics import silhouette_samples, silhouette_score +from sklearn.cluster import KMeans,SpectralClustering,AffinityPropagation,AgglomerativeClustering,Birch,DBSCAN,FeatureAgglomeration,MeanShift,OPTICS +from model import AE, VAE, VAE2d +from util_function import * +from graph_function import * +from benchmark_util import * +from gae_embedding import GAEembedding,measure_clustering_results,test_clustering_benchmark_results +# from LTMG_R import * +import pandas as pd + +# Benchmark for both celltype identification and imputation, needs 
Preprocessing_main.py first, then proceed by this script. +parser = argparse.ArgumentParser(description='main benchmark for scRNA with timer and mem') +parser.add_argument('--datasetName', type=str, default='1.Biase', + help='Dataset: 1-13 benchmark: 1.Biase/2.Li/3.Treutlein/4.Yan/5.Goolam/6.Guo/7.Deng/8.Pollen/9.Chung/10.Usoskin/11.Kolodziejczyk/12.Klein/13.Zeisel') +parser.add_argument('--batch-size', type=int, default=12800, metavar='N', + help='input batch size for training (default: 12800)') +parser.add_argument('--epochs', type=int, default=500, metavar='N', + help='number of epochs to train in Regulatory Autoencoder (default: 500)') +parser.add_argument('--EM-epochs', type=int, default=200, metavar='N', + help='number of epochs to train in iteration EM (default: 200)') +parser.add_argument('--EM-iteration', type=int, default=10, metavar='N', + help='number of epochs in EM iteration (default: 10)') +parser.add_argument('--EMtype', type=str, default='EM', + help='EM process type (default: celltypeEM) or EM') +parser.add_argument('--alpha', type=float, default=0.5, + help='iteration alpha (default: 0.5) to control the converge rate, should be a number between 0~1') +parser.add_argument('--converge-type', type=str, default='celltype', + help='type of converge: celltype/graph/both/either (default: celltype) ') +parser.add_argument('--converge-graphratio', type=float, default=0.01, + help='ratio of cell type change in EM iteration (default: 0.01), 0-1') +parser.add_argument('--converge-celltyperatio', type=float, default=0.95, + help='ratio of cell type change in EM iteration (default: 0.99), 0-1') +parser.add_argument('--cluster-epochs', type=int, default=200, metavar='N', + help='number of epochs in cluster autoencoder training (default: 200)') +parser.add_argument('--no-cuda', action='store_true', default=False, + help='enables CUDA training') +parser.add_argument('--seed', type=int, default=1, metavar='S', + help='random seed (default: 1)') +parser.add_argument('--regulized-type', type=str, default='LTMG', + help='regulized type (default: LTMG) in EM, otherwise: noregu/LTMG/LTMG01') +parser.add_argument('--reduction', type=str, default='sum', + help='reduction type: mean/sum, default(sum)') +parser.add_argument('--model', type=str, default='AE', + help='VAE/AE (default: AE)') +parser.add_argument('--gammaPara', type=float, default=0.1, + help='regulized parameter (default: 0.1)') +parser.add_argument('--alphaRegularizePara', type=float, default=0.9, + help='regulized parameter (default: 0.9)') + +# imputation related +parser.add_argument('--EMregulized-type', type=str, default='Celltype', + help='regulized type (default: noregu) in EM, otherwise: noregu/Graph/GraphR/Celltype/CelltypeR') +# parser.add_argument('--adjtype', type=str, default='unweighted', +# help='adjtype (default: weighted) otherwise: unweighted') +# parser.add_argument('--aePara', type=str, default='start', +# help='whether use parameter of first feature autoencoder: start/end/cont') +parser.add_argument('--gammaImputePara', type=float, default=0.0, + help='regulized parameter (default: 0.0)') +parser.add_argument('--graphImputePara', type=float, default=0.3, + help='graph parameter (default: 0.3)') +parser.add_argument('--celltypeImputePara', type=float, default=0.1, + help='celltype parameter (default: 0.1)') +parser.add_argument('--L1Para', type=float, default=1.0, + help='L1 regulized parameter (default: 0.001)') +parser.add_argument('--L2Para', type=float, default=0.0, + help='L2 regulized parameter (default: 
0.0)') +parser.add_argument('--EMreguTag', action='store_true', default=False, + help='whether to use regularization in EM process') +parser.add_argument('--discreteTag', action='store_true', default=False, + help='whether input is raw or 0/1 (default: False)') +#Build cell graph +parser.add_argument('--k', type=int, default=10, + help='parameter k in KNN graph (default: 10)') +parser.add_argument('--knn-distance', type=str, default='euclidean', + help='KNN graph distance type: euclidean/cosine/correlation (default: euclidean)') +parser.add_argument('--prunetype', type=str, default='KNNgraphStatsSingleThread', + help='prune type, KNNgraphStats/KNNgraphML/KNNgraphStatsSingleThread (default: KNNgraphStatsSingleThread)') +parser.add_argument('--zerofillFlag', action='store_true', default=False, + help='fill zero or not before EM process (default: False)') + +#Debug related +parser.add_argument('--precisionModel', type=str, default='Float', + help='Single precision/Double precision: Float/Double (default: Float)') +parser.add_argument('--coresUsage', type=str, default='1', + help='how many cores used: all/1/... (default: 1)') +parser.add_argument('--npyDir', type=str, default='npyGraphTest/', + help='save npy results in directory') +parser.add_argument('--log-interval', type=int, default=100, metavar='N', + help='how many batches to wait before logging training status') +parser.add_argument('--saveinternal', action='store_true', default=False, + help='whether to save internal iteration results or not') +parser.add_argument('--debuginfo', action='store_true', default=False, + help='whether to output debug info on CPU time and memory usage') + +#LTMG related +parser.add_argument('--inferLTMGTag', action='store_true', default=False, + help='whether to infer LTMG') +parser.add_argument('--LTMGDir', type=str, default='/home/jwang/data/scData/', + help='directory of LTMG files (default: /home/jwang/data/scData/)') +parser.add_argument('--expressionFile', type=str, default='Biase_expression.csv', + help='expression file in csv') +parser.add_argument('--ltmgFile', type=str, default='ltmg.csv', + help='LTMG file in csv') + +#Clustering related +parser.add_argument('--useGAEembedding', action='store_true', default=False, + help='whether to use GAE embedding for clustering (default: False)') +parser.add_argument('--useBothembedding', action='store_true', default=False, + help='whether to use both embedding and graph embedding for clustering (default: False)') +parser.add_argument('--n-clusters', default=20, type=int, help='number of clusters if predefined for KMeans/Birch') +parser.add_argument('--clustering-method', type=str, default='LouvainK', + help='Clustering method: Louvain/KMeans/SpectralClustering/AffinityPropagation/AgglomerativeClustering/AgglomerativeClusteringK/Birch/BirchN/MeanShift/OPTICS/LouvainK/LouvainB') +parser.add_argument('--maxClusterNumber', type=int, default=30, + help='maximum number of clusters for celltypeEM without a preset number of clusters (default: 30)') +parser.add_argument('--minMemberinCluster', type=int, default=5, + help='minimum number of members in each cluster for celltypeEM (default: 5)') +parser.add_argument('--resolution', type=str, default='auto', + help='resolution for Louvain clustering (default: auto, otherwise a number such as 0.5/0.8)') + + +#Benchmark related +parser.add_argument('--benchmark', type=str, default='/home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv', + help='the benchmark file of celltype (default: /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv)') + +#Agglomerative clustering related
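+# ('--linkage' below is only used when --clustering-method is AgglomerativeClustering; see the clustering step later in this script.)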
+parser.add_argument('--linkage', type=str, default='ward', + help='linkage should be: ward, average, complete, single') + +#GAE related +parser.add_argument('--GAEmodel', type=str, default='gcn_vae', help="models used") +parser.add_argument('--GAEepochs', type=int, default=200, help='Number of epochs to train.') +parser.add_argument('--GAEhidden1', type=int, default=32, help='Number of units in hidden layer 1.') +parser.add_argument('--GAEhidden2', type=int, default=16, help='Number of units in hidden layer 2.') +parser.add_argument('--GAElr', type=float, default=0.01, help='Initial learning rate.') +parser.add_argument('--GAEdropout', type=float, default=0., help='Dropout rate (1 - keep probability).') +parser.add_argument('--GAElr_dw', type=float, default=0.001, help='Initial learning rate for regularization.') + +#Start Impute or not, only used for evaluating Impute +parser.add_argument('--imputeMode', default=False, action='store_true', + help='impute or not (default: False). Caution: usually change npuDir if set imputeMode as true') +parser.add_argument('--dropoutRatio', type=float, default=0.1, + help='dropout ratio for impute (default: 0.1)') + +args = parser.parse_args() +args.cuda = not args.no_cuda and torch.cuda.is_available() + +#TODO +#As we have lots of parameters, should check args +checkargs(args) + +torch.manual_seed(args.seed) +device = torch.device("cuda" if args.cuda else "cpu") + +if not args.coresUsage == 'all': + torch.set_num_threads(int(args.coresUsage)) + +kwargs = {'num_workers': 1, 'pin_memory': True} if args.cuda else {} +print(args) +start_time = time.time() +print ('---0:00:00---scRNA starts loading.') + +if not args.imputeMode: + # if args.discreteTag: + # scData = scBenchDataset(args.datasetName, args.discreteTag) + # else: + # scData = scBenchDataset(args.datasetName, args.discreteTag, transform=logtransform) + scData = scBenchDataset(args.datasetName, args.discreteTag) +else: + # if args.discreteTag: + # scData = scDatasetDropout(args.datasetName, args.discreteTag, args.dropoutRatio) + # else: + # scData = scDatasetDropout(args.datasetName, args.discreteTag, args.dropoutRatio, transform=logtransform) + scData = scDatasetDropout(datasetName=args.datasetName, discreteTag=args.discreteTag, ratio=args.dropoutRatio, seed=args.seed) +train_loader = DataLoader(scData, batch_size=args.batch_size, shuffle=False, **kwargs) + +if args.inferLTMGTag: + #run LTMG in R + runLTMG(args.LTMGDir+'test/'+args.expressionFile,args.LTMGDir+'test/') + ltmgFile = args.ltmgFile +else: + ltmgFile = args.datasetName+'/T2000_UsingOriginalMatrix/T2000_LTMG.txt' + +regulationMatrix = readLTMGnonsparse(args.LTMGDir, ltmgFile) +regulationMatrix = torch.from_numpy(regulationMatrix) +if args.precisionModel == 'Double': + regulationMatrix = regulationMatrix.type(torch.DoubleTensor) +elif args.precisionModel == 'Float': + regulationMatrix = regulationMatrix.type(torch.FloatTensor) + +# Original +if args.model == 'VAE': + # model = VAE(dim=scData.features.shape[1]).to(device) + model = VAE2d(dim=scData.features.shape[1]).to(device) +elif args.model == 'AE': + model = AE(dim=scData.features.shape[1]).to(device) +if args.precisionModel == 'Double': + model=model.double() +optimizer = optim.Adam(model.parameters(), lr=1e-3) + +#Benchmark +bench_pd=pd.read_csv(args.benchmark,index_col=0) +#t1=pd.read_csv('/home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv',index_col=0) +bench_celltype=bench_pd.iloc[:,0].to_numpy() + +#whether to output debuginfo in running time and memory consumption +def 
debuginfoStr(info): + if args.debuginfo: + print ('---'+str(datetime.timedelta(seconds=int(time.time()-start_time)))+'---'+info) + mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + print('Mem consumption: '+str(mem)) + +debuginfoStr('scRNA has been successfully loaded') + +#TODO: have to improve save npy +def train(epoch, train_loader=train_loader, EMFlag=False, taskType='celltype'): + ''' + EMFlag indicates whether in EM processes. + If in EM, use regulized-type parsed from program entrance, + Otherwise, noregu + taskType: celltype or imputation + ''' + model.train() + train_loss = 0 + # for batch_idx, (data, _) in enumerate(train_loader): + # for batch_idx, data in enumerate(train_loader): + for batch_idx, (data, dataindex) in enumerate(train_loader): + if args.precisionModel == 'Double': + data = data.type(torch.DoubleTensor) + elif args.precisionModel == 'Float': + data = data.type(torch.FloatTensor) + data = data.to(device) + regulationMatrixBatch = regulationMatrix[dataindex,:] + regulationMatrixBatch = regulationMatrixBatch.to(device) + optimizer.zero_grad() + if args.model == 'VAE': + recon_batch, mu, logvar, z = model(data) + # Original + # loss = loss_function(recon_batch, data, mu, logvar) + if taskType == 'celltype': + if EMFlag and (not args.EMreguTag): + loss = loss_function_graph(recon_batch, data.view(-1, recon_batch.shape[1]), mu, logvar, gammaPara=args.gammaPara, regulationMatrix=regulationMatrixBatch, regularizer_type='noregu', reguPara=args.alphaRegularizePara, modelusage=args.model, reduction=args.reduction) + else: + loss = loss_function_graph(recon_batch, data.view(-1, recon_batch.shape[1]), mu, logvar, gammaPara=args.gammaPara, regulationMatrix=regulationMatrixBatch, regularizer_type=args.regulized_type, reguPara=args.alphaRegularizePara, modelusage=args.model, reduction=args.reduction) + elif taskType == 'imputation': + if EMFlag and (not args.EMreguTag): + loss = loss_function_graph_celltype(recon_batch, data.view(-1, recon_batch.shape[1]), mu, logvar, graphregu=adjsample, celltyperegu=celltypesample, gammaPara=args.gammaImputePara, regulationMatrix=regulationMatrixBatch, regularizer_type=args.EMregulized_type, reguPara=args.graphImputePara, reguParaCelltype=args.celltypeImputePara, modelusage=args.model, reduction=args.reduction) + else: + loss = loss_function_graph_celltype(recon_batch, data.view(-1, recon_batch.shape[1]), mu, logvar, graphregu=adjsample, celltyperegu=celltypesample, gammaPara=args.gammaImputePara, regulationMatrix=regulationMatrixBatch, regularizer_type=args.regulized_type, reguPara=args.graphImputePara, reguParaCelltype=args.celltypeImputePara, modelusage=args.model, reduction=args.reduction) + + elif args.model == 'AE': + recon_batch, z = model(data) + mu_dummy = '' + logvar_dummy = '' + # Original + # loss = loss_function(recon_batch, data, mu, logvar) + if taskType == 'celltype': + if EMFlag and (not args.EMreguTag): + loss = loss_function_graph(recon_batch, data.view(-1, recon_batch.shape[1]), mu_dummy, logvar_dummy, gammaPara=args.gammaPara, regulationMatrix=regulationMatrixBatch, regularizer_type='noregu', reguPara=args.alphaRegularizePara, modelusage=args.model, reduction=args.reduction) + else: + loss = loss_function_graph(recon_batch, data.view(-1, recon_batch.shape[1]), mu_dummy, logvar_dummy, gammaPara=args.gammaPara, regulationMatrix=regulationMatrixBatch, regularizer_type=args.regulized_type, reguPara=args.alphaRegularizePara, modelusage=args.model, reduction=args.reduction) + elif taskType == 'imputation': + if EMFlag and 
(not args.EMreguTag): + loss = loss_function_graph_celltype(recon_batch, data.view(-1, recon_batch.shape[1]), mu_dummy, logvar_dummy, graphregu=adjsample, celltyperegu=celltypesample, gammaPara=args.gammaImputePara, regulationMatrix=regulationMatrixBatch, regularizer_type=args.EMregulized_type, reguPara=args.graphImputePara, reguParaCelltype=args.celltypeImputePara, modelusage=args.model, reduction=args.reduction) + else: + loss = loss_function_graph_celltype(recon_batch, data.view(-1, recon_batch.shape[1]), mu_dummy, logvar_dummy, graphregu=adjsample, celltyperegu=celltypesample, gammaPara=args.gammaImputePara, regulationMatrix=regulationMatrixBatch, regularizer_type=args.regulized_type, reguPara=args.graphImputePara, reguParaCelltype=args.celltypeImputePara, modelusage=args.model, reduction=args.reduction) + + # L1 and L2 regularization in imputation + # 0.0 for no regularization + if taskType == 'imputation': + l1 = 0.0 + l2 = 0.0 + for p in model.parameters(): + l1 = l1 + p.abs().sum() + l2 = l2 + p.pow(2).sum() + loss = loss + args.L1Para * l1 + args.L2Para * l2 + + loss.backward() + train_loss += loss.item() + optimizer.step() + if batch_idx % args.log_interval == 0: + print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format( + epoch, batch_idx * len(data), len(train_loader.dataset), + 100. * batch_idx / len(train_loader), + loss.item() / len(data))) + + # for batch + if batch_idx == 0: + recon_batch_all=recon_batch + data_all = data + z_all = z + else: + recon_batch_all=torch.cat((recon_batch_all, recon_batch), 0) + data_all = torch.cat((data_all, data), 0) + z_all = torch.cat((z_all,z),0) + + print('====> Epoch: {} Average loss: {:.4f}'.format( + epoch, train_loss / len(train_loader.dataset))) + + return recon_batch_all, data_all, z_all + +if __name__ == "__main__": + outParaTag = str(args.k)+'-'+str(args.gammaPara)+'-'+str(args.alphaRegularizePara)+'-'+str(args.gammaImputePara)+'-'+str(args.graphImputePara)+'-'+str(args.celltypeImputePara) + # outParaTag = str(args.gammaImputePara)+'-'+str(args.graphImputePara)+'-'+str(args.celltypeImputePara) + ptfileStart = args.npyDir+args.datasetName+'_'+outParaTag+'_EMtrainingStart.pt' + stateStart = { + # 'epoch': epoch, + 'state_dict': model.state_dict(), + 'optimizer': optimizer.state_dict(), + } + ptfile = args.npyDir+args.datasetName+'_EMtraining.pt' + + # Step 1. 
celltype clustering + # store parameter + torch.save(stateStart,ptfileStart) + + # Save results only when impute + discreteStr = '' + if args.discreteTag: + discreteStr = 'D' + + if args.imputeMode: + # Does not need now + # save_sparse_matrix(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_features.npz',scData.features) + # sp.save_npz(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_features.npz',scData.features) + np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_features.npy',scData.features) + np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_dropi.npy',scData.i) + np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_dropj.npy',scData.j) + np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_dropix.npy',scData.ix) + + debuginfoStr('Start feature autoencoder training') + + for epoch in range(1, args.epochs + 1): + recon, original, z = train(epoch, EMFlag=False) + + debuginfoStr('Feature autoencoder training finished') + + zOut = z.detach().cpu().numpy() + # torch.save(model.state_dict(),ptfile) + ptstatus = model.state_dict() + + # Store reconOri for imputation + reconOri = recon.clone() + reconOri = reconOri.detach().cpu().numpy() + + # Step 1. Inferring celltype + #Define resolution + #Default: auto, otherwise use user defined resolution + if args.resolution == 'auto': + if zOut.shape[0]< 2000: + resolution = 0.8 + else: + resolution = 0.5 + else: + resolution = float(args.resolution) + + debuginfoStr('Start construct cell grpah') + # Here para = 'euclidean:10' + # adj, edgeList = generateAdj(zOut, graphType='KNNgraphML', para = args.knn_distance+':'+str(args.k)) + adj, edgeList = generateAdj(zOut, graphType=args.prunetype, para = args.knn_distance+':'+str(args.k), adjTag = (args.useGAEembedding or args.useBothembedding)) + # if args.adjtype == 'unweighted': + # adj, edgeList = generateAdj(zOut, graphType=args.prunetype, para = args.knn_distance+':'+str(args.k)) + # adjdense = sp.csr_matrix.todense(adj) + # elif args.adjtype == 'weighted': + # adj, edgeList = generateAdjWeighted(zOut, graphType=args.prunetype, para = args.knn_distance+':'+str(args.k)) + # adjdense = adj.toarray() + debuginfoStr('Cell Graph constructed and pruned') + + # if args.saveinternal: + # reconOut = recon.detach().cpu().numpy() + # if args.imputeMode: + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_recon.npy',reconOut) + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_z.npy',zOut) + # else: + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+outParaTag+'_recon.npy',reconOut) + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+outParaTag+'_z.npy',zOut) + + # Whether use GAE embedding + debuginfoStr('Start Graph Autoencoder training') + if args.useGAEembedding or args.useBothembedding: + zDiscret = zOut>np.mean(zOut,axis=0) + zDiscret = 1.0*zDiscret + if args.useGAEembedding: + zOut=GAEembedding(zDiscret, adj, args) + elif args.useBothembedding: + zEmbedding=GAEembedding(zDiscret, adj, args) + zOut=np.concatenate((zOut,zEmbedding),axis=1) + debuginfoStr('Graph 
Autoencoder training finished') + + # For iteration studies + G0 = nx.Graph() + G0.add_weighted_edges_from(edgeList) + nlG0=nx.normalized_laplacian_matrix(G0) + # set iteration criteria for converge + adjOld = nlG0 + # set celltype criteria for converge + listResultOld = [1 for i in range(zOut.shape[0])] + + #Fill the zeros before EM iteration + # TODO: better implementation later, now we don't filling zeros for now + if args.zerofillFlag: + for nz_index in range(len(scData.nz_i)): + # tmp = scipy.sparse.lil_matrix.todense(scData.features[scData.nz_i[nz_index], scData.nz_j[nz_index]]) + # tmp = np.asarray(tmp).reshape(-1)[0] + tmp = scData.features[scData.nz_i[nz_index], scData.nz_j[nz_index]] + reconOut[scData.nz_i[nz_index], scData.nz_j[nz_index]] = tmp + recon = reconOut + + debuginfoStr('EM Iteration started') + for bigepoch in range(0, args.EM_iteration): + iteration_time = time.time() + + # Now for both methods, we need do clustering, using clustering results to check converge + # TODO May reimplement later + # Clustering: Get cluster + clustering_time = time.time() + if args.clustering_method=='Louvain': + # Louvain: the only function has R dependent + # Seperate here for platforms without R support + from R_util import generateLouvainCluster + listResult,size = generateLouvainCluster(edgeList) + k = len(np.unique(listResult)) + print('Louvain cluster: '+str(k)) + elif args.clustering_method=='LouvainK': + from R_util import generateLouvainCluster + listResult,size = generateLouvainCluster(edgeList) + k = len(np.unique(listResult)) + print('Louvain cluster: '+str(k)) + # resolution of louvain cluster: + k = int(k*resolution) if k>3 else 2 + clustering = KMeans(n_clusters=k, random_state=0).fit(zOut) + listResult = clustering.predict(zOut) + elif args.clustering_method=='LouvainB': + from R_util import generateLouvainCluster + listResult,size = generateLouvainCluster(edgeList) + k = len(np.unique(listResult)) + print('Louvain cluster: '+str(k)) + # resolution of louvain cluster: + k = int(k*resolution) if k>3 else 2 + clustering = Birch(n_clusters=k).fit(zOut) + listResult = clustering.predict(zOut) + elif args.clustering_method=='KMeans': + clustering = KMeans(n_clusters=args.n_clusters, random_state=0).fit(zOut) + listResult = clustering.predict(zOut) + elif args.clustering_method=='SpectralClustering': + clustering = SpectralClustering(n_clusters=args.n_clusters, assign_labels="discretize", random_state=0).fit(zOut) + listResult = clustering.labels_.tolist() + elif args.clustering_method=='AffinityPropagation': + clustering = AffinityPropagation().fit(zOut) + listResult = clustering.predict(zOut) + elif args.clustering_method=='AgglomerativeClustering': + clustering = AgglomerativeClustering(linkage=args.linkage).fit(zOut) + listResult = clustering.labels_.tolist() + elif args.clustering_method=='AgglomerativeClusteringK': + clustering = AgglomerativeClustering(n_clusters=args.n_clusters).fit(zOut) + listResult = clustering.labels_.tolist() + elif args.clustering_method=='Birch': + clustering = Birch(n_clusters=args.n_clusters).fit(zOut) + listResult = clustering.predict(zOut) + elif args.clustering_method=='BirchN': + clustering = Birch(n_clusters=None).fit(zOut) + listResult = clustering.predict(zOut) + elif args.clustering_method=='MeanShift': + clustering = MeanShift().fit(zOut) + listResult = clustering.labels_.tolist() + elif args.clustering_method=='OPTICS': + clustering = OPTICS(min_samples=int(args.k/2), min_cluster_size=args.minMemberinCluster).fit(zOut) + listResult = 
clustering.labels_.tolist() + else: + print("Error: Clustering method not appropriate") + # print("---Clustering takes %s seconds ---" % (time.time() - clustering_time)) + + # If clusters more than maxclusters, then have to stop + if len(set(listResult))>args.maxClusterNumber or len(set(listResult))<=1: + print("Stopping: Number of clusters is " + str(len(set(listResult))) + ".") + # Exit + # return None + # Else: dealing with the number + listResult = trimClustering(listResult,minMemberinCluster=args.minMemberinCluster,maxClusterNumber=args.maxClusterNumber) + + #Calculate silhouette + measure_clustering_results(zOut, listResult) + print('Total Cluster Number: '+str(len(set(listResult)))) + + debuginfoStr(str(bigepoch)+'th iter: Cluster Autoencoder training started') + #Graph regulizated EM AE with celltype AE, do the additional AE + if args.EMtype == 'celltypeEM': + # Each cluster has a autoencoder, and organize them back in iteraization + clusterIndexList = [] + for i in range(len(set(listResult))): + clusterIndexList.append([]) + for i in range(len(listResult)): + clusterIndexList[listResult[i]].append(i) + + reconNew = np.zeros((scData.features.shape[0],scData.features.shape[1])) + + # Convert to Tensor + reconNew = torch.from_numpy(reconNew) + if args.precisionModel == 'Double': + reconNew = reconNew.type(torch.DoubleTensor) + elif args.precisionModel == 'Float': + reconNew = reconNew.type(torch.FloatTensor) + reconNew = reconNew.to(device) + + # model.load_state_dict(torch.load(ptfile)) + model.load_state_dict(ptstatus) + + for clusterIndex in clusterIndexList: + reconUsage = recon[clusterIndex] + scDataInter = scDatasetInter(reconUsage) + train_loader = DataLoader(scDataInter, batch_size=args.batch_size, shuffle=False, **kwargs) + for epoch in range(1, args.cluster_epochs + 1): + reconCluster, originalCluster, zCluster = train(epoch, EMFlag=True) + count = 0 + for i in clusterIndex: + reconNew[i] = reconCluster[count,:] + count +=1 + # Update + recon = reconNew + # torch.save(model.state_dict(),ptfile) + ptstatus = model.state_dict() + + debuginfoStr(str(bigepoch)+'th iter: Cluster Autoencoder training succeed') + + # Use new dataloader + scDataInter = scDatasetInter(recon) + train_loader = DataLoader(scDataInter, batch_size=args.batch_size, shuffle=False, **kwargs) + + debuginfoStr(str(bigepoch)+'th iter: Start construct cell grpah') + for epoch in range(1, args.EM_epochs + 1): + recon, original, z = train(epoch, EMFlag=True) + + zOut = z.detach().cpu().numpy() + + # Here para = 'euclidean:10' + # adj, edgeList = generateAdj(zOut, graphType='KNNgraphML', para = args.knn_distance+':'+str(args.k)) + adj, edgeList = generateAdj(zOut, graphType=args.prunetype, para = args.knn_distance+':'+str(args.k), adjTag = (args.useGAEembedding or args.useBothembedding or (bigepoch == int(args.EM_iteration)-1))) + # if args.adjtype == 'unweighted': + # adj, edgeList = generateAdj(zOut, graphType=args.prunetype, para = args.knn_distance+':'+str(args.k)) + # adjdense = sp.csr_matrix.todense(adj) + # elif args.adjtype == 'weighted': + # adj, edgeList = generateAdjWeighted(zOut, graphType=args.prunetype, para = args.knn_distance+':'+str(args.k)) + # adjdense = adj.toarray() + debuginfoStr(str(bigepoch)+'th iter: Cell Graph constructed and pruned') + + debuginfoStr(str(bigepoch)+'th iter: Start Graph Autoencoder training') + # Whether use GAE embedding + if args.useGAEembedding or args.useBothembedding: + zDiscret = zOut>np.mean(zOut,axis=0) + zDiscret = 1.0*zDiscret + if args.useGAEembedding: + 
zOut=GAEembedding(zDiscret, adj, args) + elif args.useBothembedding: + zEmbedding=GAEembedding(zDiscret, adj, args) + zOut=np.concatenate((zOut,zEmbedding),axis=1) + + debuginfoStr(str(bigepoch)+'th iter: Graph Autoencoder training finished') + + if args.saveinternal: + reconOut = recon.detach().cpu().numpy() + if args.imputeMode: + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_recon'+str(bigepoch)+'.npy',reconOut) + np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_z'+str(bigepoch)+'.npy',zOut) + else: + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+outParaTag+'_recon'+str(bigepoch)+'.npy',reconOut) + np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+outParaTag+'_z'+str(bigepoch)+'.npy',zOut) + + # print("---One iteration in EM process, proceeded %s seconds ---" % (time.time() - iteration_time)) + + #Iteration usage + Gc = nx.Graph() + Gc.add_weighted_edges_from(edgeList) + adjGc = nx.adjacency_matrix(Gc) + + # Update new adj + adjNew = args.alpha*nlG0 + (1-args.alpha) * adjGc/np.sum(adjGc,axis=0) + + #debug + graphChange = np.mean(abs(adjNew-adjOld)) + graphChangeThreshold = args.converge_graphratio * np.mean(abs(nlG0)) + print('adjNew:{} adjOld:{} G0:{}'.format(adjNew, adjOld, nlG0)) + print('mean:{} threshold:{}'.format(graphChange, graphChangeThreshold)) + silhouette, chs, dbs = measureClusteringNoLabel(zOut, listResult) + ari, ami, nmi, cs, fms, vms, hs = measureClusteringTrueLabel(listResultOld, listResult) + print(listResultOld) + print(listResult) + print('celltype similarity:'+str(ari)) + ari, ami, nmi, cs, fms, vms, hs = measureClusteringTrueLabel(bench_celltype, listResult) + resultarray=[] + resultstr = str(silhouette)+' '+str(chs)+' '+str(dbs)+' '+str(ari)+' '+str(ami)+' '+str(nmi)+' '+str(cs)+' '+str(fms)+' '+str(vms)+' '+str(hs) + resultarray.append(resultstr) + print('All Results: ') + print(resultstr) + + if args.saveinternal: + if args.imputeMode: + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_benchmark'+str(bigepoch)+'.txt',resultarray,fmt='%s') + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_graph'+str(bigepoch)+'.csv',edgeList,fmt='%d,%d,%2.1f') + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_results'+str(bigepoch)+'.txt',listResult,fmt='%d') + else: + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_benchmark'+str(bigepoch)+'.txt',resultarray,fmt='%s') + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_graph'+str(bigepoch)+'.csv',edgeList,fmt='%d,%d,%2.1f') + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_results'+str(bigepoch)+'.txt',listResult,fmt='%d') + + # graph criteria + if args.converge_type == 'graph': + if graphChange < graphChangeThreshold: + print('Graph Converge now!') + # Converge, Update + adjOld = adjNew + listResultOld = listResult + break + # celltype criteria + elif args.converge_type == 'celltype': + if ari>args.converge_celltyperatio: + print('Celltype Converge now!') + # Converge, Update + adjOld = adjNew + listResultOld = listResult + break + # if both criteria are meets + elif args.converge_type == 'both': + if graphChange < graphChangeThreshold and ari > 
args.converge_celltyperatio: + print('Graph and Celltype Converge now!') + # Converge, Update + adjOld = adjNew + listResultOld = listResult + break + # if either criteria are meets + elif args.converge_type == 'either': + if graphChange < graphChangeThreshold or ari > args.converge_celltyperatio: + print('Graph or Celltype Converge now!') + # Converge, Update + adjOld = adjNew + listResultOld = listResult + break + + # Update + adjOld = adjNew + listResultOld = listResult + # torch.cuda.empty_cache() + debuginfoStr(str(bigepoch)+'th iter: Iteration finished') + + + # Output celltype related results + if args.imputeMode: + np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_final_edgeList.npy',edgeList) + else: + np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+outParaTag+'_final_edgeList.npy',edgeList) + + # np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_'+str(args.L1Para)+'_'+str(args.L2Para)+'_recon.csv',reconOut,delimiter=",",fmt='%10.4f') + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_'+str(args.L1Para)+'_'+str(args.L2Para)+'_embedding.csv',zOut, delimiter=",",fmt='%10.4f') + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_'+str(args.L1Para)+'_'+str(args.L2Para)+'_graph.csv',edgeList,fmt='%d,%d,%2.1f') + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_'+str(args.L1Para)+'_'+str(args.L2Para)+'_results.txt',listResult,fmt='%d') + + resultarray=[] + silhouette, chs, dbs = measureClusteringNoLabel(zOut, listResult) + ari, ami, nmi, cs, fms, vms, hs = measureClusteringTrueLabel(bench_celltype, listResult) + resultstr = str(silhouette)+' '+str(chs)+' '+str(dbs)+' '+str(ari)+' '+str(ami)+' '+str(nmi)+' '+str(cs)+' '+str(fms)+' '+str(vms)+' '+str(hs) + resultarray.append(resultstr) + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_'+str(args.L1Para)+'_'+str(args.L2Para)+'_benchmark.txt',resultarray,fmt='%s') + + # save internal results for imputation + # if args.imputeMode: + # np.save(args.npyDir+args.datasetName+'_'+str(args.dropoutRatio)+'_'+args.regulized_type+'_reconOri.npy',reconOri) + # np.save(args.npyDir+args.datasetName+'_'+str(args.dropoutRatio)+'_'+args.regulized_type+'_adj.npy',adj) + # np.save(args.npyDir+args.datasetName+'_'+str(args.dropoutRatio)+'_'+args.regulized_type+'_listResult.npy',listResult) + # else: + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+'_reconOri.npy',reconOri) + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+'_adj.npy',adj) + # np.save(args.npyDir+args.datasetName+'_'+args.regulized_type+'_listResult.npy',listResult) + + # Step 2. 
Imputation with best results of graph and celltype + + # if args.imputeMode: + # reconOri = np.load(args.npyDir+args.datasetName+'_'+str(args.dropoutRatio)+'_'+args.regulized_type+'_reconOri.npy') + # adj = np.load(args.npyDir+args.datasetName+'_'+str(args.dropoutRatio)+'_'+args.regulized_type+'_adj.npy',allow_pickle=True) + # listResult = np.load(args.npyDir+args.datasetName+'_'+str(args.dropoutRatio)+'_'+args.regulized_type+'_listResult.npy') + # else: + # reconOri = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+'_reconOri.npy') + # adj = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+'_adj.npy',allow_pickle=True) + # listResult = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+'_listResult.npy') + + # Use new dataloader + scDataInter = scDatasetInter(reconOri) + train_loader = DataLoader(scDataInter, batch_size=args.batch_size, shuffle=False, **kwargs) + + stateStart = torch.load(ptfileStart) + model.load_state_dict(stateStart['state_dict']) + optimizer.load_state_dict(stateStart['optimizer']) + # if args.aePara == 'start': + # model.load_state_dict(torch.load(ptfileStart)) + # elif args.aePara == 'end': + # model.load_state_dict(torch.load(ptfileEnd)) + + # generate graph regularizer from graph + # adj = adj.tolist() # Used for read/load + # adjdense = sp.csr_matrix.todense(adj) + + # generate adj from edgeList + adjdense = sp.csr_matrix.todense(adj) + adjsample = torch.from_numpy(adjdense) + if args.precisionModel == 'Float': + adjsample = adjsample.float() + elif args.precisionModel == 'Double': + adjsample = adjsample.type(torch.DoubleTensor) + adjsample = adjsample.to(device) + + # generate celltype regularizer from celltype + celltypesample = generateCelltypeRegu(listResult) + + celltypesample = torch.from_numpy(celltypesample) + if args.precisionModel == 'Float': + celltypesample = celltypesample.float() + elif args.precisionModel == 'Double': + celltypesample = celltypesample.type(torch.DoubleTensor) + celltypesample = celltypesample.to(device) + + for epoch in range(1, args.EM_epochs + 1): + recon, original, z = train(epoch, EMFlag=True, taskType='imputation') + + reconOut = recon.detach().cpu().numpy() + + # out imputation Results + if args.imputeMode: + np.save (args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_recon.npy',reconOut) + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+str(args.dropoutRatio)+'_'+outParaTag+'_recon.csv',reconOut,delimiter=",",fmt='%10.4f') + else: + np.save (args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_recon.npy',reconOut) + np.savetxt(args.npyDir+args.datasetName+'_'+args.regulized_type+'_'+outParaTag+'_recon.csv',reconOut,delimiter=",",fmt='%10.4f') + + debuginfoStr('scGNN finished') diff --git a/plot_distribution.py b/plot_distribution.py new file mode 100644 index 0000000..081f6ad --- /dev/null +++ b/plot_distribution.py @@ -0,0 +1,108 @@ +import numpy as np +import matplotlib.pyplot as plt +import argparse +from scipy.stats import chi2_contingency +from scipy.stats import nbinom + +parser = argparse.ArgumentParser(description='Infer Spatial from Expression in single cells') + +parser.add_argument('--datasetName', type=str, default='1.Biase', + help='Dataset: 1-13 benchmark: 1.Biase/2.Li/3.Treutlein/4.Yan/5.Goolam/6.Guo/7.Deng/8.Pollen/9.Chung/10.Usoskin/11.Kolodziejczyk/12.Klein/13.Zeisel') +parser.add_argument('--para', type=str, default='LTMG_0.1_10-0.1-0.9-0.0-0.3-0.1', + help='save npy results in directory') 
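+# Example invocation (a sketch, assuming the default scGNN output layout: --para is the parameter tag
+# embedded in the saved .npy filenames, e.g. <dataset>_<para>_recon.npy under --inDir; the --outDir
+# directory is assumed to exist already):
+#   python plot_distribution.py --datasetName 12.Klein --para LTMG_0.1_10-0.1-0.9-0.0-0.3-0.1 \
+#       --inDir npyGraphTest/ --outDir DistNpy/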
+parser.add_argument('--inDir', type=str, default='npyGraphTest/', + help='save npy results in directory') +parser.add_argument('--outDir', type=str, default='DistNpy/', + help='save npy results in directory') +args = parser.parse_args() + + +ix=np.load(args.inDir+args.datasetName+'_'+args.para+'_dropix.npy') +i =np.load(args.inDir+args.datasetName+'_'+args.para+'_dropi.npy') +j =np.load(args.inDir+args.datasetName+'_'+args.para+'_dropj.npy') +# recon =np.load('12.Klein_LTMG_0.1_10-0.1-0.9-0.0-0.3-0.1_recon.npy',allow_pickle=True) +# features=np.load('/Users/juexinwang/Downloads/temp/12.Klein_LTMG_0.1_10-0.1-0.9-0.0-0.3-0.1_features.npy',allow_pickle=True) +recon =np.load(args.inDir+args.datasetName+'_'+args.para+'_recon.npy',allow_pickle=True) +features=np.load(args.inDir+args.datasetName+'_'+args.para+'_features.npy',allow_pickle=True) +features=features.tolist() +features=features.todense() + +# Directly use plt histogram +# Careful! plt.hist does not work for huge datasets + +# _ = plt.hist(features.ravel()) +# plt.savefig(args.outDir+'/'+args.datasetName+'_'+args.para+'_features.png') +# plt.close() + +# features_log = np.log(features+1) +# _ = plt.hist(features_log.ravel(),bin=100) +# plt.savefig(args.outDir+'/'+args.datasetName+'_'+args.para+'_features_log.png') +# plt.close() + +# _ = plt.hist(recon.ravel(),bin=100) +# plt.savefig(args.outDir+'/'+args.datasetName+'_'+args.para+'_recon.png') +# plt.close() + +# recon_exp = np.exp(recon)-1 +# plt.savefig(args.outDir+'/'+args.datasetName+'_'+args.para+'_recon_exp.png') +# plt.close() + +# Something wrong, have to change to here: +# plt.bar(bin_edges[:-1], hist) +# plt.xlim(min(bin_edges), max(bin_edges)) + +# Use numpy histogram +hist, bin_edges = np.histogram(features.ravel(), bins = np.arange(0,np.max(features)+10,10)) +# print(hist) +x_pos = [i for i, _ in enumerate(hist)] +plt.bar(x_pos, hist) +plt.xticks(x_pos, bin_edges[:-1]) +plt.xticks(rotation=90) +plt.savefig(args.outDir+'/'+args.datasetName+'_'+args.para+'_features.png') +plt.close() + +features_log = np.log(features+1) +hist, bin_edges = np.histogram(features_log.ravel(), bins = np.arange(0,np.max(features_log)+0.1,0.1)) +# print(hist) +x_pos = [i for i, _ in enumerate(hist)] +plt.bar(x_pos, hist) +plt.xticks(x_pos, bin_edges[:-1]) +plt.xticks(rotation=90) +plt.savefig(args.outDir+'/'+args.datasetName+'_'+args.para+'_features_log.png') +plt.close() + +hist, bin_edges = np.histogram(recon.ravel(), bins = np.arange(0,np.max(recon)+0.1,0.1)) +# print(hist) +x_pos = [i for i, _ in enumerate(hist)] +plt.bar(x_pos, hist) +plt.xticks(x_pos, bin_edges[:-1]) +plt.xticks(rotation=90) +plt.savefig(args.outDir+'/'+args.datasetName+'_'+args.para+'_recon.png') +plt.close() + +recon_exp = np.exp(recon)-1 +hist, bin_edges = np.histogram(recon_exp.ravel(), bins = np.arange(0,np.max(recon_exp)+10,10)) +print(hist) +x_pos = [i for i, _ in enumerate(hist)] +plt.bar(x_pos, hist) +plt.xticks(x_pos, bin_edges[:-1]) +plt.xticks(rotation=90) +plt.savefig(args.outDir+'/'+args.datasetName+'_'+args.para+'_recon_exp.png') +plt.close() + +#test +# find x,y in 2D matrix +# numpy.unravel_index(a.argmax(), a.shape) +# data = [[207, 282, 241], [282, 240, 234, 3]] +# chi2_contingency(data) +np.savetxt(args.outDir+'/'+args.datasetName+'_'+args.para+'_features.txt', features, fmt='%d') + +# https://stats.stackexchange.com/questions/260580/negative-binomial-distribution-with-python-scipy-stats +# https://en.wikipedia.org/wiki/Negative_binomial_distribution#Alternative_formulations +# mean = np.mean(features) 
+# var = np.var(features) +# p = (var-mean)/var +# r = mean**2/(var-mean) +# x = np.arange(nbinom.ppf(0.01, p, r),nbinom.ppf(0.99, p, r)) +# ax.plot(x, nbinom.pmf(x, p, r), 'bo', ms=8, label='nbinom pmf') + diff --git a/plot_distribution.r b/plot_distribution.r new file mode 100644 index 0000000..c09f27b --- /dev/null +++ b/plot_distribution.r @@ -0,0 +1,84 @@ +# R +# Running after plot_distribution.py + +# http://www.gamlss.com/wp-content/uploads/2013/01/book-2010-Athens1.pdf +# https://arxiv.org/pdf/1810.02618.pdf +# https://rdrr.io/cran/gamlss.dist/man/ZANBI.html + +#install in conda: +# https://anaconda.org/conda-forge/r-fitdistrplus +# https://anaconda.org/conda-forge/r-gamlss +# install.packages("fitdistrplus") +# install.packages("gamlss") +library(fitdistrplus) +library(gamlss) + +args = commandArgs(trailingOnly=TRUE) +if (length(args)==0) { + stop("At least four argument must be supplied (input file).n", call.=FALSE) +} + +datasetName=args[1] +para=args[2] +indir=args[3] +outdir=args[4] + +features = read.table(paste(indir,"/",datasetName,"_",para,"_features.txt",sep=''), header = FALSE, sep = " ") +features = data.matrix(features) +features = as.vector(features) +features = as.numeric(features) + +print(paste(indir,"/",datasetName,"_",para,"_features.txt",sep='')) +mu_ = mean(features) +sigma_ = (sd(features)-mean(features))/mean(features)**2 +# http://www.gamlss.com/wp-content/uploads/2013/01/book-2010-Athens1.pdf Page 219 +fit_nbi = fitdist(features, 'NBI', start = list(mu = mu_, sigma = sigma_ )) +gofstat(fit_nbi) +tiff(file= paste(outdir,"/",datasetName,"_",para,"_NBI.tiff",sep='')) +plot(fit_nbi) +dev.off() + +# http://www.gamlss.com/wp-content/uploads/2013/01/book-2010-Athens1.pdf Page 221 +fit_zinb_= fitdist(features, 'ZINBI', start = list(mu = mu_, sigma = sigma_)) +gofstat(fit_zinb_) +tiff(file=paste(outdir,"/",datasetName,"_",para,"_ZINBI_.tiff",sep='')) +plot(fit_zinb_) +dev.off() + +nu_ = 1-length(which(features!=0))/(length(features)) +fit_zinb= fitdist(features, 'ZINBI', start = list(mu = mu_, sigma = sigma_, nu = nu_)) +gofstat(fit_zinb) +tiff(file=paste(outdir,"/",datasetName,"_",para,"_ZINBI.tiff",sep='')) +plot(fit_zinb) +dev.off() + + + + +# NBI: +# Goodness-of-fit statistics +# 1-mle-NBI +# Kolmogorov-Smirnov statistic 3.671374e-01 +# Cramer-von Mises statistic 1.016737e+05 +# Anderson-Darling statistic Inf + +# Goodness-of-fit criteria +# 1-mle-NBI +# Akaike's Information Criterion 25429885 +# Bayesian Information Criterion 25429912 + + +# ZINB +# Goodness-of-fit statistics +# 1-mle-ZINBI +# Kolmogorov-Smirnov statistic 4.532250e-01 +# Cramer-von Mises statistic 1.873046e+05 +# Anderson-Darling statistic Inf + +# Goodness-of-fit criteria +# 1-mle-ZINBI +# Akaike's Information Criterion 25969108 +# Bayesian Information Criterion 25969135 + +# Can learn from * +# https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.nbinom.html \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index fd0d666..9b91edd 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,11 +1,11 @@ -numpy==1.18.1 -torch==1.4.0 -networkx==2.4 -pandas==0.25.3 -rpy2==3.2.4 -matplotlib==3.1.2 -seaborn==0.9.0 -umap-learn==0.3.10 -munkres==1.1.2 +numpy +torch>=1.4.0 +networkx>=2.4 +pandas>=0.25.3 +rpy2>=3.2.4 +matplotlib>=3.1.2 +seaborn>=0.9.0 +umap-learn +munkres>=1.1.2 community -tqdm==4.48.0 \ No newline at end of file +tqdm>=4.48.0 \ No newline at end of file diff --git a/results/Klein_correlation.py b/results/Klein_correlation.py new file mode 100644 
index 0000000..05b86e3 --- /dev/null +++ b/results/Klein_correlation.py @@ -0,0 +1,82 @@ +import numpy as np +from scipy import stats +import pandas as pd +import csv + +# Get correlation from gene interactions from Klein datasets in Figure 3 of scGNN paper +# Ref: Klein, Allon M., et al. "Droplet barcoding for single-cell transcriptomics applied to embryonic stem cells." Cell 161.5 (2015): 1187-1201. + +geneList=[ + 'Krt8', #4 + 'S100a6', #19 + 'Id2', #895 + 'Id1', #602 + 'ld3', #1559 + 'Ccnd1',# not in the range + 'Ccnb1',# not in the range + 'Ccnd2',# not in the range + 'Ccna1',# not in the range + 'Sox17',# not in the range + 'Col4a1', #226 + 'Pou5f1', #150 + 'Ccnd3', #255 + 'Ccna2',# not in the range + 'Nanog', #1449 + 'Klf4',# not in the range + 'Sox2', # 601 + 'Zfp42', #527 + 'Trim28', #136 + 'Esrrb', #849 + 'Tdh', #206 +] + +geneNumList=[ + 4, + 19, + 895, + 602, + 1559, + 226, + 150, + 255, + 1449, + 601, + 527, + 136, + 849, + 206, +] + +savedir = './fig3/' +methodList = ['magic','saucie','saver','scimpute','scvi','scvinorm','dca','deepimpute','scIGANs','netNMFsc'] + +def corCal(method='magic'): + if method == 'scIGANs': + df = pd.read_csv('/storage/htc/joshilab/jghhd/singlecellTest/scIGAN/Result_200_0.0/12.Klein/scIGANs_npyImputeG2E_1_12.Klein_LTMG_0.0_10-0.1-0.9-0.0-0.3-0.1_features_log.csv_Klein_only_label.csv.txt',sep='\s+',index_col=0) + x = df.to_numpy() + else: + if method == 'scvinorm': + filename = '/storage/htc/joshilab/wangjue/scGNN/scvi/12.Klein_0.0_1_recon_normalized.npy' + x = np.load(filename,allow_pickle=True) + x = x.T + elif method == 'netNMFsc': + filename = '/storage/htc/joshilab/jghhd/singlecellTest/netNMFsc/result_mi_100000/0.0/12.Klein/npyImputeG2E_1_log_imputation.npy' + x = np.load(filename,allow_pickle=True) + else: + filename = '/storage/htc/joshilab/wangjue/scGNN/{}/12.Klein_0.0_1_recon.npy'.format(method) + x = np.load(filename,allow_pickle=True) + x = x.T + + corr = np.zeros((len(geneNumList),len(geneNumList))) + for i in range(len(geneNumList)): + for j in range(len(geneNumList)): + corr[i,j]=stats.pearsonr(x[geneNumList[i],:], x[geneNumList[j],:])[0] + + out_filename = savedir+method+".csv" + with open(out_filename, "w") as f: + writer = csv.writer(f) + writer.writerows(corr) + + +for method in methodList: + corCal(method=method) \ No newline at end of file diff --git a/results/Klein_correlation.sh b/results/Klein_correlation.sh new file mode 100644 index 0000000..01eb788 --- /dev/null +++ b/results/Klein_correlation.sh @@ -0,0 +1,14 @@ +#! 
/bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J Fig3 +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R +python -W ignore Klein_correlation.py \ No newline at end of file diff --git a/results/louvain.py b/results/louvain.py new file mode 100644 index 0000000..967b2d6 --- /dev/null +++ b/results/louvain.py @@ -0,0 +1,39 @@ +import os, sys +sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir)) +import numpy as np +from util_function import * +from graph_function import * +from R_util import generateLouvainCluster +import argparse + +parser = argparse.ArgumentParser(description='main benchmark for scRNA with timer and mem') +parser.add_argument('--k', type=int, default=10, + help='parameter k in KNN graph (default: 10)') +parser.add_argument('--knn-distance', type=str, default='euclidean', + help='KNN graph distance type: euclidean/cosine/correlation (default: euclidean)') +parser.add_argument('--prunetype', type=str, default='KNNgraphStatsSingleThreadNoPrune', + help='prune type, KNNgraphStats/KNNgraphML/KNNgraphStatsSingleThread (default: KNNgraphStats)') +#Benchmark related +parser.add_argument('--benchmark', type=str, default='/home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv', + help='the benchmark file of celltype (default: /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv)') +parser.add_argument('--input', type=str, default='filename', + help='input filename') +parser.add_argument('--output', type=str, default='filename', + help='input filename') +args = parser.parse_args() + +#Benchmark +bench_pd=pd.read_csv(args.benchmark,index_col=0) +bench_celltype=bench_pd.iloc[:,0].to_numpy() + +zOut = np.load(args.input,allow_pickle=True) +zOut,re = pcaFunc(zOut, n_components=10) +adj, edgeList = generateAdj(zOut, graphType=args.prunetype, para = args.knn_distance+':'+str(args.k)) +listResult,size = generateLouvainCluster(edgeList) +silhouette, chs, dbs = measureClusteringNoLabel(zOut, listResult) +ari, ami, nmi, cs, fms, vms, hs = measureClusteringTrueLabel(bench_celltype, listResult) +resultstr = str(silhouette)+' '+str(chs)+' '+str(dbs)+' '+str(ari)+' '+str(ami)+' '+str(nmi)+' '+str(cs)+' '+str(fms)+' '+str(vms)+' '+str(hs) +print(resultstr) + +with open(args.output,'w') as fw: + fw.writelines("%s\n" % strr for strr in listResult) diff --git a/results/louvain.sh b/results/louvain.sh new file mode 100644 index 0000000..3f51bea --- /dev/null +++ b/results/louvain.sh @@ -0,0 +1,57 @@ +#! 
/bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J Louvain +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R +python -W ignore louvain.py --input othermethods/magic/9.Chung_0.0_1_recon.npy --output otherresults/magic/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/magic/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/magic/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input othermethods/magic/12.Klein_0.0_1_recon.npy --output otherresults/magic/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/magic/13.Zeisel_0.0_1_recon.npy --output otherresults/magic/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore louvain.py --input othermethods/dca/9.Chung_0.0_1_recon.npy --output otherresults/dca/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/dca/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/dca/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input othermethods/dca/12.Klein_0.0_1_recon.npy --output otherresults/dca/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/dca/13.Zeisel_0.0_1_recon.npy --output otherresults/dca/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore louvain.py --input othermethods/deepimpute/9.Chung_0.0_1_recon.npy --output otherresults/deepimpute/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/deepimpute/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/deepimpute/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input othermethods/deepimpute/12.Klein_0.0_1_recon.npy --output otherresults/deepimpute/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/deepimpute/13.Zeisel_0.0_1_recon.npy --output otherresults/deepimpute/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore louvain.py --input othermethods/netNMFsc/9.Chung_0.0_1_recon.npy --output otherresults/netNMFsc/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/netNMFsc/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/netNMFsc/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input othermethods/netNMFsc/12.Klein_0.0_1_recon.npy --output otherresults/netNMFsc/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/netNMFsc/13.Zeisel_0.0_1_recon.npy --output otherresults/netNMFsc/13.txt --benchmark 
/home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore louvain.py --input othermethods/saucie/9.Chung_0.0_1_recon.npy --output otherresults/saucie/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/saucie/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/saucie/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input othermethods/saucie/12.Klein_0.0_1_recon.npy --output otherresults/saucie/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/saucie/13.Zeisel_0.0_1_recon.npy --output otherresults/saucie/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore louvain.py --input othermethods/saver/9.Chung_0.0_1_recon.npy --output otherresults/saver/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/saver/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/saver/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input othermethods/saver/12.Klein_0.0_1_recon.npy --output otherresults/saver/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/saver/13.Zeisel_0.0_1_recon.npy --output otherresults/saver/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore louvain.py --input othermethods/scIGANs/9.Chung_0.0_1_recon.npy --output otherresults/scIGANs/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/scIGANs/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/scIGANs/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input othermethods/scIGANs/12.Klein_0.0_1_recon.npy --output otherresults/scIGANs/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/scIGANs/13.Zeisel_0.0_1_recon.npy --output otherresults/scIGANs/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore louvain.py --input othermethods/scimpute/9.Chung_0.0_1_recon.npy --output otherresults/scimpute/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/scimpute/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/scimpute/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input othermethods/scimpute/12.Klein_0.0_1_recon.npy --output otherresults/scimpute/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/scimpute/13.Zeisel_0.0_1_recon.npy --output otherresults/scimpute/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore louvain.py --input othermethods/scvi/9.Chung_0.0_1_recon.npy --output otherresults/scvi/9.txt --benchmark /home/jwang/data/scData/9.Chung/Chung_cell_label.csv +python -W ignore louvain.py --input othermethods/scvi/11.Kolodziejczyk_0.0_1_recon.npy --output otherresults/scvi/11.txt --benchmark /home/jwang/data/scData/11.Kolodziejczyk/Kolodziejczyk_cell_label.csv +python -W ignore louvain.py --input 
othermethods/scvi/12.Klein_0.0_1_recon.npy --output otherresults/scvi/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore louvain.py --input othermethods/scvi/13.Zeisel_0.0_1_recon.npy --output otherresults/scvi/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv diff --git a/results/results.sh b/results/results.sh new file mode 100644 index 0000000..e1f5d8e --- /dev/null +++ b/results/results.sh @@ -0,0 +1,25 @@ +#! /bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J Louvain +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R + +python -W ignore results_tmp.py --inputOri othermethods/saucie/12.Klein_0.0_1_recon.npy --input otherresults/saucie/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore results_tmp.py --inputOri othermethods/saucie/13.Zeisel_0.0_1_recon.npy --input otherresults/saucie/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore results_tmp.py --inputOri othermethods/scvi/12.Klein_0.0_1_recon.npy --input otherresults/scvi/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore results_tmp.py --inputOri othermethods/scvi/13.Zeisel_0.0_1_recon.npy --input otherresults/scvi/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore results_tmp.py --inputOri othermethods/netNMFsc/12.Klein_0.0_1_recon.npy --input otherresults/netNMFsc/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore results_tmp.py --inputOri othermethods/netNMFsc/13.Zeisel_0.0_1_recon.npy --input otherresults/netNMFsc/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv + +python -W ignore results_tmp.py --inputOri othermethods/scIGANs/12.Klein_0.0_1_recon.npy --input otherresults/scIGANs/12.txt --benchmark /home/jwang/data/scData/12.Klein/Klein_cell_label.csv +python -W ignore results_tmp.py --inputOri othermethods/scIGANs/13.Zeisel_0.0_1_recon.npy --input otherresults/scIGANs/13.txt --benchmark /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv diff --git a/results/results_Reading_recheck.py b/results/results_Reading_recheck.py new file mode 100644 index 0000000..e64b509 --- /dev/null +++ b/results/results_Reading_recheck.py @@ -0,0 +1,312 @@ +import os +import argparse +parser = argparse.ArgumentParser(description='Read Results in different methods') +parser.add_argument('--methodName', type=int, default=0, + help="method used: 0-62") +parser.add_argument('--imputeMode', default=True, action='store_true', + help='impute or not (default: False). Caution: usually change npuDir if set imputeMode as true') +parser.add_argument('--runMode',action='store_true', default=False, help="Run or prepare cluster script") +parser.add_argument('--splitMode', default=False, action='store_true', + help='whether split, used for long queue') +parser.add_argument('--batchStr', type=int, default=0, + help="method used: 1-13") +args = parser.parse_args() + +# New notes: +# We used this in paper revision, will generate lots of .sh files. 
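+# For example, submitCluster_Result_Impute_recheck.sh (later in this repo) redirects the printed
+# SLURM template into one script per method/batch and then submits it, roughly:
+#   python results_Reading_recheck.py --methodName 0 --splitMode --batchStr 12 > run_Results_Impute_0-12.sh
+#   sbatch run_Results_Impute_0-12.sh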
+# This file is called by submitCluster_Result_Impute_recheck.sh, and only check .out files. +# The results can be get by cat *.out + +# Old Note: +# Generate results in python other than in shell for better organization +# We are not use runpy.run_path('main_result.py') for it is hard to pass arguments +# We are not use subprocess.call("python main_result.py", shell=True) for it runs scripts parallel +# So we use os.system('') here + +if args.splitMode: + #The split of batch, more batches, more parallel + + if args.batchStr == 9: + datasetList = [ + '9.Chung', + # '9.Chung --discreteTag' + ] + elif args.batchStr == 11: + datasetList = [ + '11.Kolodziejczyk', + # '11.Kolodziejczyk --discreteTag' + ] + elif args.batchStr == 12: + datasetList = [ + '12.Klein', + # '12.Klein --discreteTag' + ] + elif args.batchStr == 13: + datasetList = [ + '13.Zeisel', + # '13.Zeisel --discreteTag' + ] +else: + datasetList = [ + '9.Chung', + '11.Kolodziejczyk', + '12.Klein', + '13.Zeisel', + ] + +if args.imputeMode: + pyStr = 'results_impute_graph.py' + + npyList = [ + '../npyImputeG2E_1/ --ratio 0.1', #1 + '../npyImputeG2E_1/ --ratio 0.3', #2 + '../npyImputeG2E_1/ --ratio 0.6', #3 + '../npyImputeG2E_1/ --ratio 0.8', #4 + '../npyImputeG2EL_1/ --ratio 0.1', #5 + '../npyImputeG2EL_1/ --ratio 0.3', #6 + '../npyImputeG2EL_1/ --ratio 0.6', #7 + '../npyImputeG2EL_1/ --ratio 0.8', #8 + '../npyImputeG1E_1/ --ratio 0.1', #9 + '../npyImputeG1E_1/ --ratio 0.3', #10 + '../npyImputeG1E_1/ --ratio 0.6', #11 + '../npyImputeG1E_1/ --ratio 0.8', #12 + '../npyImputeG2F_1/ --ratio 0.1', #13 + '../npyImputeG2F_1/ --ratio 0.3', #14 + '../npyImputeG2F_1/ --ratio 0.6', #15 + '../npyImputeG2F_1/ --ratio 0.8', #16 + '../npyImputeN2E_1/ --ratio 0.1', #17 + '../npyImputeN2E_1/ --ratio 0.3', #18 + '../npyImputeN2E_1/ --ratio 0.6', #19 + '../npyImputeN2E_1/ --ratio 0.8', #20 + + '../npyImputeG2E_2/ --ratio 0.1', #21 + '../npyImputeG2E_2/ --ratio 0.3', #22 + '../npyImputeG2E_2/ --ratio 0.6', #23 + '../npyImputeG2E_2/ --ratio 0.8', #24 + '../npyImputeG2EL_2/ --ratio 0.1', #25 + '../npyImputeG2EL_2/ --ratio 0.3', #26 + '../npyImputeG2EL_2/ --ratio 0.6', #27 + '../npyImputeG2EL_2/ --ratio 0.8', #28 + '../npyImputeG1E_2/ --ratio 0.1', #29 + '../npyImputeG1E_2/ --ratio 0.3', #30 + '../npyImputeG1E_2/ --ratio 0.6', #31 + '../npyImputeG1E_2/ --ratio 0.8', #32 + '../npyImputeG2F_2/ --ratio 0.1', #33 + '../npyImputeG2F_2/ --ratio 0.3', #34 + '../npyImputeG2F_2/ --ratio 0.6', #35 + '../npyImputeG2F_2/ --ratio 0.8', #36 + '../npyImputeN2E_2/ --ratio 0.1', #37 + '../npyImputeN2E_2/ --ratio 0.3', #38 + '../npyImputeN2E_2/ --ratio 0.6', #39 + '../npyImputeN2E_2/ --ratio 0.8', #40 + + '../npyImputeG2E_3/ --ratio 0.1', #41 + '../npyImputeG2E_3/ --ratio 0.3', #42 + '../npyImputeG2E_3/ --ratio 0.6', #43 + '../npyImputeG2E_3/ --ratio 0.8', #44 + '../npyImputeG2EL_3/ --ratio 0.1', #45 + '../npyImputeG2EL_3/ --ratio 0.3', #46 + '../npyImputeG2EL_3/ --ratio 0.6', #47 + '../npyImputeG2EL_3/ --ratio 0.8', #48 + '../npyImputeG1E_3/ --ratio 0.1', #49 + '../npyImputeG1E_3/ --ratio 0.3', #50 + '../npyImputeG1E_3/ --ratio 0.6', #51 + '../npyImputeG1E_3/ --ratio 0.8', #52 + '../npyImputeG2F_3/ --ratio 0.1', #53 + '../npyImputeG2F_3/ --ratio 0.3', #54 + '../npyImputeG2F_3/ --ratio 0.6', #55 + '../npyImputeG2F_3/ --ratio 0.8', #56 + '../npyImputeN2E_3/ --ratio 0.1', #57 + '../npyImputeN2E_3/ --ratio 0.3', #58 + '../npyImputeN2E_3/ --ratio 0.6', #59 + '../npyImputeN2E_3/ --ratio 0.8', #60 + + ] + +else: + pyStr = 'results_celltype.py' + + npyList = [ + '../npyG1B/', #0 
+ '../npyG1E/', #1 + '../npyG1F/', #2 + '../npyR1B/', #3 + '../npyR1E/', #4 + '../npyR1F/', #5 + '../npyN1B/', #6 + '../npyN1E/', #7 + '../npyN1F/', #8 + '../npyG2B/', #9 + '../npyG2E/', #10 + '../npyG2F/', #11 + '../npyR2B/', #12 + '../npyR2E/', #13 + '../npyR2F/', #14 + '../npyN2B/', #15 + '../npyN2E/', #16 + '../npyN2F/', #17 + + '../npyG1B_LK/', #18 + '../npyG1E_LK/', #19 + '../npyG1F_LK/', #20 + '../npyR1B_LK/', #21 + '../npyR1E_LK/', #22 + '../npyR1F_LK/', #23 + '../npyN1B_LK/', #24 + '../npyN1E_LK/', #25 + '../npyN1F_LK/', #26 + '../npyG2B_LK/', #27 + '../npyG2E_LK/', #28 + '../npyG2F_LK/', #29 + '../npyR2B_LK/', #30 + '../npyR2E_LK/', #31 + '../npyR2F_LK/', #32 + '../npyN2B_LK/', #33 + '../npyN2E_LK/', #34 + '../npyN2F_LK/', #35 + + '../npyG1B_LB/', #36 + '../npyG1E_LB/', #37 + '../npyG1F_LB/', #38 + '../npyR1B_LB/', #39 + '../npyR1E_LB/', #40 + '../npyR1F_LB/', #41 + '../npyN1B_LB/', #42 + '../npyN1E_LB/', #43 + '../npyN1F_LB/', #44 + '../npyG2B_LB/', #45 + '../npyG2E_LB/', #46 + '../npyG2F_LB/', #47 + '../npyR2B_LB/', #48 + '../npyR2E_LB/', #49 + '../npyR2F_LB/', #50 + '../npyN2B_LB/', #51 + '../npyN2E_LB/', #52 + '../npyN2F_LB/', #53 + ] + +reguDict={} + +for i in range(0,16): + reguDict[i]='LTMG' +for i in range(16,20): + reguDict[i]='noregu' +for i in range(20,36): + reguDict[i]='LTMG' +for i in range(36,40): + reguDict[i]='noregu' +for i in range(40,56): + reguDict[i]='LTMG' +for i in range(56,60): + reguDict[i]='noregu' + +reguStr='' +if args.methodName in reguDict: + reguStr=' --regulized-type ' + reguDict[args.methodName] + ' ' + +npyStr = npyList[args.methodName] + +benchmarkStr = '' + +if args.runMode: + labelFileDir = '/home/wangjue/biodata/scData/allBench/' +else: + labelFileDir = '/home/jwang/data/scData/' + +def getBenchmarkStr(count): + benchmarkStr = '' + if args.batchStr == 1: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '1.Biase/Biase_cell_label.csv '\ + '--n-clusters 3 ' + elif args.batchStr == 2: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '2.Li/Li_cell_label.csv '\ + '--n-clusters 9 ' + elif args.batchStr == 3: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '3.Treutlein/Treutlein_cell_label.csv '\ + '--n-clusters 5 ' + elif args.batchStr == 4: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '4.Yan/Yan_cell_label.csv '\ + '--n-clusters 7 ' + elif args.batchStr == 5: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '5.Goolam/Goolam_cell_label.csv '\ + '--n-clusters 5 ' + elif args.batchStr == 6: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '6.Guo/Guo_cell_label.csv '\ + '--n-clusters 9 ' + elif args.batchStr == 7: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '7.Deng/Deng_cell_label.csv '\ + '--n-clusters 10 ' + elif args.batchStr == 8: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '8.Pollen/Pollen_cell_label.csv '\ + '--n-clusters 11 ' + elif args.batchStr == 9: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '9.Chung/Chung_cell_label.csv '\ + '--n-clusters 4 ' + elif args.batchStr == 10: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '10.Usoskin/Usoskin_cell_label.csv '\ + '--n-clusters 11 ' + elif args.batchStr == 11: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '11.Kolodziejczyk/Kolodziejczyk_cell_label.csv '\ + '--n-clusters 3 ' + elif args.batchStr == 12: + benchmarkStr = ' 
--benchmark '\ + '--labelFilename ' + labelFileDir + '12.Klein/Klein_cell_label.csv '\ + '--n-clusters 4 ' + elif args.batchStr == 13: + benchmarkStr = ' --benchmark '\ + '--labelFilename ' + labelFileDir + '13.Zeisel/Zeisel_cell_label.csv '\ + '--n-clusters 7 ' + + return benchmarkStr + + +if not args.runMode: + if args.imputeMode: + imputeStr = 'I' + else: + imputeStr = 'C' + splitStr = '' + if args.splitMode: + splitStr = '_'+str(args.batchStr) + templateStr = "#! /bin/bash\n"\ + "######################### Batch Headers #########################\n"\ + "#SBATCH -A xulab\n"\ + "#SBATCH -p Lewis,BioCompute # use the BioCompute partition\n"\ + "#SBATCH -J R" + imputeStr + '_' + str(args.methodName) + splitStr + " \n"\ + "#SBATCH -o results-%j.out # give the job output a custom name\n"\ + "#SBATCH -t 2-00:00 # two days time limit\n"\ + "#SBATCH -N 1 # number of nodes\n"\ + "#SBATCH -n 1 # number of cores (AKA tasks)\n"\ + "#SBATCH --mem=128G\n"\ + "#################################################################\n"\ + "module load miniconda3\n"\ + "source activate conda_R\n" + print(templateStr) + +count = 0 +for datasetStr in datasetList: + commandStr = 'python -W ignore ' + pyStr + ' --datasetName ' + datasetStr + reguStr + getBenchmarkStr(count) + ' --npyDir ' + npyStr + if args.runMode: + os.system(commandStr) + else: + print(commandStr) + # for i in range(10): + # commandStr = 'python -W ignore ' + pyStr + ' --datasetName ' + datasetStr + reguStr + getBenchmarkStr(count) + ' --reconstr '+ str(i) + ' --npyDir ' + npyStr + # if args.runMode: + # os.system(commandStr) + # else: + # print(commandStr) + count += 1 + + diff --git a/results/results_impute_graph.py b/results/results_impute_graph.py index 4145dd1..3f33f3c 100644 --- a/results/results_impute_graph.py +++ b/results/results_impute_graph.py @@ -63,20 +63,19 @@ # dropi = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_0.0-0.3-0.1_dropi.npy') # dropj = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_0.0-0.3-0.1_dropj.npy') # dropix = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_0.0-0.3-0.1_dropix.npy') -dropi = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-'+args.regupara+'_dropi.npy') -dropj = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-'+args.regupara+'_dropj.npy') -dropix = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-'+args.regupara+'_dropix.npy') - - +dropi = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-0.0-0.3-'+args.regupara+'_dropi.npy') +dropj = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-0.0-0.3-'+args.regupara+'_dropj.npy') +dropix = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-0.0-0.3-'+args.regupara+'_dropix.npy') # featuresImpute = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_'+args.regupara+'_recon'+args.reconstr+'.npy') -featuresImpute = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-'+args.regupara+'_recon'+args.reconstr+'.npy') +featuresImpute = 
np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-0.0-0.3-'+args.regupara+'_recon'+args.reconstr+'.npy') # featuresImpute = np.load(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_0.0-0.3-0.1_recon'+args.reconstr+'.npy') # featuresImpute = pd.read_csv(args.npyDir+args.datasetName+'_'+args.regulized_type+discreteStr+'_'+args.regupara+'_0.0_0.0_recon'+args.reconstr+'.csv') # featuresImpute = featuresImpute.to_numpy() -l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax = imputation_error_log(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) -print('{:.4f} {:.4f} {:.4f} {:.4f} '.format(l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax), end='') +l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax, rmse = imputation_error_log(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) +cosine = imputation_cosine_log(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) +print('{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f} '.format(l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax, cosine, rmse), end='') def imputeResult(inputData): ''' diff --git a/results/results_impute_others_all.py b/results/results_impute_others_all.py new file mode 100644 index 0000000..c2b6d9f --- /dev/null +++ b/results/results_impute_others_all.py @@ -0,0 +1,76 @@ +import os +import numpy as np +import pandas as pd +import argparse +import scipy.sparse +import sys +sys.path.append('../') +from util_function import * +from benchmark_util import * +from R_util import generateLouvainCluster +from sklearn.cluster import KMeans +import argparse +parser = argparse.ArgumentParser(description='Read Results in different methods') +args = parser.parse_args() + +# Notes: +# In HPC, call by sbatch submit_Impute_others.sh + +datasetList = [ + '9.Chung', + '11.Kolodziejczyk', + '12.Klein', + '13.Zeisel', +] + +oridirStr = '../npyImputeG2E' +medirStr = '../' + +seedList = ['1','2','3'] +ratioList = ['0.1','0.3','0.6','0.8'] + +# sophisticated, not using +# methodList = ['magic','saucie','saver','scimpute','scvi','scvinorm','dca','deepimpute','scIGANslog','scIGANs','netNMFsclog','netNMFsc'] + +# We should use only log(x+1) if the method permitted +methodList = ['magic','saucie','saver','scimpute','scvi','scvinorm','dca','deepimpute','scIGANs','netNMFsc'] + +def outResults(datasetName,seed,ratio,method): + featuresOriginal = load_data(datasetName, discreteTag=False) + + features = None + dropi = np.load(oridirStr+'_'+seed+'/'+datasetName+'_LTMG_'+ratio+'_10-0.1-0.9-0.0-0.3-0.1_dropi.npy') + dropj = np.load(oridirStr+'_'+seed+'/'+datasetName+'_LTMG_'+ratio+'_10-0.1-0.9-0.0-0.3-0.1_dropj.npy') + dropix = np.load(oridirStr+'_'+seed+'/'+datasetName+'_LTMG_'+ratio+'_10-0.1-0.9-0.0-0.3-0.1_dropix.npy') + + # scGNN results + # featuresImpute = np.load(npyDir+datasetName+'_'+args.regulized_type+discreteStr+'_'+args.ratio+'_10-0.1-0.9-0.0-0.3-'+args.regupara+'_recon'+args.reconstr+'.npy') + if method == 'scvinorm': + featuresImpute = np.load(medirStr+'scvi/'+datasetName+'_'+ratio+'_'+seed+'_recon_normalized.npy') + # not using now + elif method == 'scIGANs': + df = pd.read_csv('/storage/htc/joshilab/jghhd/singlecellTest/scIGAN/Result_200_'+ratio+'/'+datasetName+'/scIGANs_npyImputeG2E_'+seed+'_'+datasetName+'_LTMG_'+ratio+'_10-0.1-0.9-0.0-0.3-0.1_features_log.csv_'+datasetName.split('.')[1]+'_only_label.csv.txt',sep='\s+',index_col=0) + tmp = df.to_numpy() + featuresImpute = tmp.T + elif method == 'netNMFsc': + featuresImpute = 
np.load('/storage/htc/joshilab/jghhd/singlecellTest/netNMFsc/result_mi_100000/'+ratio+'/'+datasetName+'/npyImputeG2E_'+seed+'_log_imputation.npy') + featuresImpute = featuresImpute.T + else: + featuresImpute = np.load(medirStr+method+'/'+datasetName+'_'+ratio+'_'+seed+'_recon.npy') + + # No log + if method=='dca' or method=='deepimpute': + l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax, rmse = imputation_error(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) + cosine = imputation_cosine(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) + # log + else: + l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax, rmse = imputation_error_log(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) + cosine = imputation_cosine_log(featuresImpute, featuresOriginal, features, dropi, dropj, dropix) + print('{:.4f} {:.4f} {:.4f} {:.4f} {:.4f} {:.4f}'.format(l1ErrorMean, l1ErrorMedian, l1ErrorMin, l1ErrorMax, cosine, rmse)) + + +for method in methodList: + for datasetName in datasetList: + for seed in seedList: + for ratio in ratioList: + outResults(datasetName=datasetName, seed=seed, ratio=ratio, method=method) \ No newline at end of file diff --git a/results/results_tmp.py b/results/results_tmp.py new file mode 100644 index 0000000..97aab4d --- /dev/null +++ b/results/results_tmp.py @@ -0,0 +1,31 @@ +import os, sys +sys.path.append(os.path.join(os.path.dirname(os.path.realpath(__file__)), os.pardir)) +# sys.path.append('../') +import numpy as np +from util_function import * +from graph_function import * +import argparse + +parser = argparse.ArgumentParser(description='main benchmark for scRNA with timer and mem') +#Benchmark related +parser.add_argument('--benchmark', type=str, default='/home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv', + help='the benchmark file of celltype (default: /home/jwang/data/scData/13.Zeisel/Zeisel_cell_label.csv)') +parser.add_argument('--input', type=str, default='filename', + help='input filename') +parser.add_argument('--inputOri', type=str, default='filename', + help='input filename') +args = parser.parse_args() + +#Benchmark +bench_pd=pd.read_csv(args.benchmark,index_col=0) +bench_celltype=bench_pd.iloc[:,0].to_numpy() + + +#'saucie/13.txt' +z_pd = pd.read_csv(args.input,header=None) +listResult = z_pd.iloc[:,0].to_numpy() +zOut = np.load(args.inputOri,allow_pickle=True) +silhouette, chs, dbs = measureClusteringNoLabel(zOut, listResult) +ari, ami, nmi, cs, fms, vms, hs = measureClusteringTrueLabel(bench_celltype, listResult) +resultstr = str(silhouette)+' '+str(chs)+' '+str(dbs)+' '+str(ari)+' '+str(ami)+' '+str(nmi)+' '+str(cs)+' '+str(fms)+' '+str(vms)+' '+str(hs) +print(resultstr) diff --git a/results/submitCluster_Result_Impute_recheck.sh b/results/submitCluster_Result_Impute_recheck.sh new file mode 100644 index 0000000..157350a --- /dev/null +++ b/results/submitCluster_Result_Impute_recheck.sh @@ -0,0 +1,17 @@ +for i in {0..59} +do +for j in {9,11,12,13} +do +python results_Reading_recheck.py --methodName $i --splitMode --batchStr $j > run_Results_Impute_$i-$j.sh +done +done + +# submit +for j in {9,11,12,13} +do +for i in {0..59} +do +sbatch run_Results_Impute_$i-$j.sh +sleep 1 +done +done \ No newline at end of file diff --git a/results/submit_Impute_others.sh b/results/submit_Impute_others.sh new file mode 100644 index 0000000..55e89f4 --- /dev/null +++ b/results/submit_Impute_others.sh @@ -0,0 +1,14 @@ +#! 
/bin/bash +######################### Batch Headers ######################### +#SBATCH -A xulab +#SBATCH -p Lewis,BioCompute # use the BioCompute partition Lewis,BioCompute +#SBATCH -J OthersResults +#SBATCH -o results-%j.out # give the job output a custom name +#SBATCH -t 2-00:00 # two days time limit +#SBATCH -N 1 # number of nodes +#SBATCH -n 1 # number of cores (AKA tasks) +#SBATCH --mem=128G +################################################################# +module load miniconda3 +source activate conda_R +python3 -W ignore results_impute_others_all.py \ No newline at end of file diff --git a/results/zeroPercentage.py b/results/zeroPercentage.py new file mode 100644 index 0000000..cef85e8 --- /dev/null +++ b/results/zeroPercentage.py @@ -0,0 +1,24 @@ +#Calculate Zero percentage in each of the datasets +import numpy as np + +def calcu(dataset='9.Chung',ratio=0.0): + t=np.load('npyImputeG2E_1/{}_LTMG_{}_10-0.1-0.9-0.0-0.3-0.1_features.npy'.format(dataset,ratio),allow_pickle=True) + t=t.tolist() + t=t.todense() + zeroNum = np.where(t==0)[0].shape[0] + allNum = t.shape[0]*t.shape[1] + percent = zeroNum/allNum + print('{} {} {}'.format(zeroNum,allNum,percent)) + +datasetList = [ + '9.Chung', + '11.Kolodziejczyk', + '12.Klein', + '13.Zeisel', +] + +ratioList = ['0.0','0.1','0.3','0.6','0.8'] + +for dataset in datasetList: + for ratio in ratioList: + calcu(dataset, ratio) \ No newline at end of file diff --git a/scGNN.py b/scGNN.py index 6f685a3..b4d5e7a 100644 --- a/scGNN.py +++ b/scGNN.py @@ -755,4 +755,6 @@ def train(epoch, train_loader=train_loader, EMFlag=False, taskType='celltype', s results_df = pd.DataFrame(listResult,index=celllist,columns=["Celltype"]) results_df.to_csv(args.outputDir+args.datasetName+'_'+args.regulized_type+'_'+str(args.alphaRegularizePara)+'_'+str(args.L1Para)+'_'+str(args.L2Para)+'_results.txt') + mem=resource.getrusage(resource.RUSAGE_SELF).ru_maxrss + print('Mem consumption: '+str(mem)) print('---'+str(datetime.timedelta(seconds=int(time.time()-start_time)))+"---scGNN finished") diff --git a/scripts/choose_louvain.py b/scripts/choose_louvain.py new file mode 100644 index 0000000..42cbe0c --- /dev/null +++ b/scripts/choose_louvain.py @@ -0,0 +1,104 @@ +# Script to test efficiency of louvain + +# Option 1: Original version, use r version of louvain, it takes time to link R, and need install rpy2. 
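+# (A possible conda-based setup, assuming the conda-forge channel provides both packages:
+#  conda install -c conda-forge rpy2 r-igraph)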
+# Not use anymore +# Clustering is different between Case one and two +import pandas as pd +import rpy2.robjects as ro +from rpy2.robjects.packages import importr +from rpy2.robjects import r, pandas2ri +pandas2ri.activate() + +# case one: +edgeList = [] +edgeList.append((0,2,1.0)) +edgeList.append((1,2,1.0)) +edgeList.append((2,3,1.0)) +edgeList.append((3,4,1.0)) +edgeList.append((4,5,1.0)) +edgeList.append((4,6,1.0)) + +# case two: +edgeList.append((0,2,1.0)) +edgeList.append((1,2,1.0)) +edgeList.append((2,3,0.1)) +edgeList.append((3,4,1.0)) +edgeList.append((4,5,1.0)) +edgeList.append((4,6,1.0)) + +fromVec = [] +toVec = [] +weightVec = [] +for edge in edgeList: + fromVec.append(edge[0]) + toVec.append(edge[1]) + weightVec.append(edge[2]) + +igraph = importr('igraph') +base = importr('base') +fromV = ro.FloatVector(fromVec) +toV = ro.FloatVector(toVec) +# weightV= ro.FloatVector([0.1,1.0,1.0,0.1,1.0]) +weightV= ro.FloatVector(weightVec) +links = ro.DataFrame({'from':fromV,'to':toV,'weight':weightV}) +g = igraph.graph_from_data_frame(links,directed = False) +cl = igraph.cluster_louvain(g) + +def as_dict(vector): + """Convert an RPy2 ListVector to a Python dict""" + result = {} + for i, name in enumerate(vector.names): + if isinstance(vector[i], ro.ListVector): + result[name] = as_dict(vector[i]) + elif len(vector[i]) == 1: + result[name] = vector[i][0] + else: + result[name] = vector[i] + return result + +cl_dict = as_dict(cl) +df = pd.DataFrame() +# df['Cluster']=cl_dict['membership'] +size = float(len(set(cl_dict['membership']))) + +listResult=[] +count = 0 +for i in range(len(cl_dict['membership'])): + listResult.append(int(cl_dict['membership'][i])-1) + count += 1 + +# Option 2: use package python-louvain, but does not work +# Clustering is identical between Case one and two, so we cannot use it +import networkx as nx +import community as community_louvain +G = nx.Graph() +G.add_weighted_edges_from(edgeList) +partition = community_louvain.best_partition(G,weight='weight') + + +# Option 3: use igraph, pure python and looks right +# Clustering is identical between Case one and two, so we cannot use it +import numpy as np +from igraph import * +#Case 1: +W=np.zeros((7,7)) +W[0,2]=1.0 +W[1,2]=1.0 +W[2,3]=1.0 +W[3,4]=1.0 +W[4,5]=1.0 +W[4,6]=1.0 + +#Case 2: +W=np.zeros((7,7)) +W[0,2]=1.0 +W[1,2]=1.0 +W[2,3]=0.1 +W[3,4]=1.0 +W[4,5]=1.0 +W[4,6]=1.0 + +graph = Graph.Weighted_Adjacency(W.tolist(), mode=ADJ_UNDIRECTED, attr="weight", loops=False) +louvain_partition = graph.community_multilevel(weights=graph.es['weight'], return_levels=False) +print(louvain_partition) + diff --git a/submitCluster_distribution.sh b/submitCluster_distribution.sh new file mode 100644 index 0000000..e36b7ec --- /dev/null +++ b/submitCluster_distribution.sh @@ -0,0 +1,32 @@ +#submit plotting + +for i in {0.1,0.3,0.6,0.8} +do +sbatch plot_G2E_$i\_9.sh +sbatch plot_G2E_$i\_11.sh +sbatch plot_G2E_$i\_12.sh +sbatch plot_G2E_$i\_13.sh +done + +# for i in {0.1,0.3,0.6,0.8} +# do +# sbatch plot_G2EL_$i\_9.sh +# sbatch plot_G1E_$i\_9.sh +# sbatch plot_G2F_$i\_9.sh +# sbatch plot_N2E_$i\_9.sh + +# sbatch plot_G2EL_$i\_11.sh +# sbatch plot_G1E_$i\_11.sh +# sbatch plot_G2F_$i\_11.sh +# sbatch plot_N2E_$i\_11.sh + +# sbatch plot_G2EL_$i\_12.sh +# sbatch plot_G1E_$i\_12.sh +# sbatch plot_G2F_$i\_12.sh +# sbatch plot_N2E_$i\_12.sh + +# sbatch plot_G2EL_$i\_13.sh +# sbatch plot_G1E_$i\_13.sh +# sbatch plot_G2F_$i\_13.sh +# sbatch plot_N2E_$i\_13.sh +# done \ No newline at end of file diff --git a/submitCluster_imputation_0.0.sh 
b/submitCluster_imputation_0.0.sh new file mode 100644 index 0000000..5dcd876 --- /dev/null +++ b/submitCluster_imputation_0.0.sh @@ -0,0 +1,4 @@ +sbatch run_experimentImpute_2_g_e_1_9_0.0.sh +sbatch run_experimentImpute_2_g_e_1_11_0.0.sh +sbatch run_experimentImpute_2_g_e_1_12_0.0.sh +sbatch run_experimentImpute_2_g_e_1_13_0.0.sh diff --git a/submitCluster_imputation_0.1-0.8-ablation.sh b/submitCluster_imputation_0.1-0.8-ablation.sh new file mode 100644 index 0000000..a822d20 --- /dev/null +++ b/submitCluster_imputation_0.1-0.8-ablation.sh @@ -0,0 +1,47 @@ +mkdir npyImputeG2E_1 +mkdir npyImputeG2EL_1 +mkdir npyImputeG2F_1 +mkdir npyImputeN2E_1 +mkdir npyImputeG1E_1 + +mkdir npyImputeG2E_2 +mkdir npyImputeG2EL_2 +mkdir npyImputeG2F_2 +mkdir npyImputeN2E_2 +mkdir npyImputeG1E_2 + +mkdir npyImputeG2E_3 +mkdir npyImputeG2EL_3 +mkdir npyImputeG2F_3 +mkdir npyImputeN2E_3 +mkdir npyImputeG1E_3 + +for i in {1..3} +do +for j in {0.1,0.3,0.6,0.8} +do +sbatch run_experimentImpute_1_g_e_$i\_9_$j\.sh +sbatch run_experimentImpute_2_g_e_$i\_9_$j\.sh +sbatch run_experimentImpute_2_g_e_L_$i\_9_$j\.sh +sbatch run_experimentImpute_2_g_f_$i\_9_$j\.sh +sbatch run_experimentImpute_2_n_e_$i\_9_$j\.sh + +sbatch run_experimentImpute_1_g_e_$i\_11_$j\.sh +sbatch run_experimentImpute_2_g_e_$i\_11_$j\.sh +sbatch run_experimentImpute_2_g_e_L_$i\_11_$j\.sh +sbatch run_experimentImpute_2_g_f_$i\_11_$j\.sh +sbatch run_experimentImpute_2_n_e_$i\_11_$j\.sh + +sbatch run_experimentImpute_1_g_e_$i\_12_$j\.sh +sbatch run_experimentImpute_2_g_e_$i\_12_$j\.sh +sbatch run_experimentImpute_2_g_e_L_$i\_12_$j\.sh +sbatch run_experimentImpute_2_g_f_$i\_12_$j\.sh +sbatch run_experimentImpute_2_n_e_$i\_12_$j\.sh + +sbatch run_experimentImpute_1_g_e_$i\_13_$j\.sh +sbatch run_experimentImpute_2_g_e_$i\_13_$j\.sh +sbatch run_experimentImpute_2_g_e_L_$i\_13_$j\.sh +sbatch run_experimentImpute_2_g_f_$i\_13_$j\.sh +sbatch run_experimentImpute_2_n_e_$i\_13_$j\.sh +done +done \ No newline at end of file diff --git a/util_function.py b/util_function.py index 103b585..f159d2a 100644 --- a/util_function.py +++ b/util_function.py @@ -65,13 +65,23 @@ def load_data(datasetName, discreteTag): names = ['x', 'tx', 'allx'] objects = [] for i in range(len(names)): - with open(dir_path+"/data/sc/{}/ind.{}.{}".format(datasetName, datasetName, names[i]), 'rb') as f: + #windows + if os.name=='nt': + filename = dir_path+"\\data\\sc\\{}\\ind.{}.{}".format(datasetName, datasetName, names[i]) + else: + filename = dir_path+"/data/sc/{}/ind.{}.{}".format(datasetName, datasetName, names[i]) + with open(filename, 'rb') as f: if sys.version_info > (3, 0): objects.append(pkl.load(f, encoding='latin1')) else: objects.append(pkl.load(f)) x, tx, allx = tuple(objects) - test_idx_reorder = parse_index_file(dir_path+"/data/sc/{}/ind.{}.test.index".format(datasetName, datasetName)) + #windows + if os.name == 'nt': + filename = dir_path+"\\data\\sc\\{}\\ind.{}.test.index".format(datasetName, datasetName) + else: + filename = dir_path+"/data/sc/{}/ind.{}.test.index".format(datasetName, datasetName) + test_idx_reorder = parse_index_file(filename) test_idx_range = np.sort(test_idx_reorder) if datasetName == 'citeseer': @@ -199,6 +209,48 @@ def __getitem__(self, idx): return sample,idx +class scDatasetDropoutSparse(Dataset): + def __init__(self, data=None, discreteTag=False, ratio=0.1, seed=1, transform=None): + """ + Args: + Sparse + datasetName (String): TGFb, etc. 
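+            data (sparse matrix): expression matrix; transposed internally so rows are cells
+            discreteTag (bool): if False, samples are log(x+1)-transformed in __getitem__
+            ratio (float): dropout rate passed to impute_dropout for the imputation benchmark
+            seed (int): random seed for the simulated dropout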
+ transform (callable, optional): + """ + + self.featuresOriginal = data.transpose() + self.ratio = ratio + # Random seed + # np.random.uniform(1, 2) + self.features, self.i, self.j, self.ix = impute_dropout(self.featuresOriginal, seed=seed, rate=self.ratio) + # Now lines are cells, and cols are genes + # self.features = self.features.transpose() + self.transform = transform + # check whether log or not + self.discreteTag = discreteTag + + def __len__(self): + return self.features.shape[0] + + def __getitem__(self, idx): + if torch.is_tensor(idx): + idx = idx.tolist() + + sample = self.features[idx,:] + if type(sample)==sp.lil_matrix: + sample = torch.from_numpy(sample.toarray()) + else: + sample = torch.from_numpy(sample) + + # transform after get the data + if self.transform: + sample = self.transform(sample) + + if not self.discreteTag: + sample = torch.log(sample+1) + + return sample,idx + class scDataset(Dataset): def __init__(self, data=None, transform=None): """