diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/01_f.R b/hca_bone_marrow_data_analysis/geneexpr/code/01_f.R
deleted file mode 100644
index 7f6341d..0000000
--- a/hca_bone_marrow_data_analysis/geneexpr/code/01_f.R
+++ /dev/null
@@ -1,14 +0,0 @@
-setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/')
-order = readRDS('./hca/result/ery/order.rds')
-mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds')
-mat = mat[,order$Cell]
-source('./function/01_function.R')
-order = data.frame(order, Patient = gsub('_.*','', order$Cell))
-ap = as.character(unique(order$Patient))
-g1 = ap[grepl('female', ap)]
-g2 = ap[grepl(':male', ap)]
-f_gene = f_statistics_from_gene(mat, order, g1, g2)
-saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_gender.rds')
-
-a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4)
-saveRDS(a,'./hca/geneexpr/result/f_statistics_from_gene_gender_permute.rds')
diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/01_f2.R b/hca_bone_marrow_data_analysis/geneexpr/code/01_f2.R
deleted file mode 100644
index f9a2993..0000000
--- a/hca_bone_marrow_data_analysis/geneexpr/code/01_f2.R
+++ /dev/null
@@ -1,15 +0,0 @@
-setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/')
-order = readRDS('./hca/result/ery/order.rds')
-mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds')
-mat = mat[,order$Cell]
-source('./function/01_function.R')
-order = data.frame(order, Patient = gsub('_.*','', order$Cell))
-ap = as.character(unique(order$Patient))
-g1 = ap[grepl('female', ap)]
-g2 = ap[grepl(':male', ap)]
-f_gene = f_statistics_from_gene(mat, order, g1, g2)
-saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_gender.rds')
-
-mat <- mat[rowMeans(mat>0.01)>0.1, ]
-a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4)
-saveRDS(a,'./hca/geneexpr/result/f_statistics_from_lowExprGene_gender_permute.rds')
diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/01_f3.R b/hca_bone_marrow_data_analysis/geneexpr/code/01_f3.R
deleted file mode 100644
index b8338b1..0000000
--- a/hca_bone_marrow_data_analysis/geneexpr/code/01_f3.R
+++ /dev/null
@@ -1,14 +0,0 @@
-setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/')
-order = readRDS('./hca/result/ery/order.rds')
-mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds')
-mat = mat[,order$Cell]
-source('./function/01_function.R')
-order = data.frame(order, Patient = gsub('_.*','', order$Cell))
-ap = as.character(unique(order$Patient))
-g1 = ap[grepl('female', ap)]
-g2 = ap[grepl(':male', ap)]
-f_gene = f_statistics_from_gene(mat, order, g1, g2)
-saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_gender.rds')
-mat <- mat[rowMeans(mat>0.01)>0.1, ]
-a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4)
-saveRDS(a,'./hca/geneexpr/result/f_statistics_from_lowExprGene_gender_permute_new1e4.rds')
diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age.R b/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age.R
deleted file mode 100644
index 4f45783..0000000
--- a/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age.R
+++ /dev/null
@@ -1,16 +0,0 @@
-setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/')
-order = readRDS('./hca/result/ery/order.rds')
-mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds')
-mat = mat[,order$Cell]
-source('./function/01_function.R')
-order = data.frame(order, Patient = gsub('_.*','', order$Cell))
-ap = as.character(unique(order$Patient))
-age = as.numeric(sapply(ap, function(i) strsplit(i,':')[[1]][2]))
-g1 = ap[order(age)[1:4]]
-g2 = ap[order(age)[5:8]]
-f_gene = f_statistics_from_gene(mat, order, g1, g2)
-saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_age.rds')
-
-mat <- mat[rowMeans(mat>0.01)>0.1, ]
-a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4)
-saveRDS(a,'./hca/geneexpr/result/f_statistics_from_lowExprGene_age_permute.rds')
diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age3.R b/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age3.R
deleted file mode 100644
index 7326ae4..0000000
--- a/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age3.R
+++ /dev/null
@@ -1,16 +0,0 @@
-setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/')
-order = readRDS('./hca/result/ery/order.rds')
-mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds')
-mat = mat[,order$Cell]
-source('./function/01_function.R')
-order = data.frame(order, Patient = gsub('_.*','', order$Cell))
-ap = as.character(unique(order$Patient))
-age = as.numeric(sapply(ap, function(i) strsplit(i,':')[[1]][2]))
-g1 = ap[order(age)[1:4]]
-g2 = ap[order(age)[5:8]]
-f_gene = f_statistics_from_gene(mat, order, g1, g2)
-saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_age.rds')
-
-mat <- mat[rowMeans(mat>0.01)>0.1, ]
-a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4)
-saveRDS(a,'./hca/geneexpr/result/f_statistics_from_lowExprGene_age_permute_new1e4.rds')
diff --git a/hca_bone_marrow_data_analysis/geneexpr/plot/01_plot.R b/hca_bone_marrow_data_analysis/geneexpr/plot/01_plot.R
deleted file mode 100644
index 3c46aa0..0000000
--- a/hca_bone_marrow_data_analysis/geneexpr/plot/01_plot.R
+++ /dev/null
@@ -1,144 +0,0 @@
-rm(list=ls())
-setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/')
-# setwd('/Users/wenpinhou/Dropbox/trajectory_variability/')
-order = readRDS('./hca/result/ery/order.rds')
-mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds')
-mat = mat[,order$Cell]
-mat <- mat[rowMeans(mat>0.01)>0.1, ]
-source('./function/01_function.R')
-order = data.frame(order, Patient = gsub('_.*','', order$Cell))
-ap = as.character(unique(order$Patient))
-g1 = ap[grepl('female', ap)]
-g2 = ap[grepl(':male', ap)]
-source('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/function.R')
-# source('/Users/wenpinhou/Dropbox/resource/function.R')
-eg <- sapply(ap, function(p){
-  print(p)
-  tmat <- mat[,grepl(p, colnames(mat))]
-  rownames(tmat)[rowMeans(tmat>0.01)>0.1]
-})
-eg <- unique(unlist(eg))
-mat = mat[eg,]
-vg <- sapply(ap, function(p){
-  print(p)
-  tmat <- mat[,grepl(p, colnames(mat))]
-  vg <- findVariableGene(tmat, num.gene = NULL ,plot.statistics=TRUE, plot.dir = paste0('./hca/geneexpr/plot.mac/',p,'/'))
-})
-vg <- unique(unlist(vg))
-mat = mat[vg,]
-b = readRDS('./hca/geneexpr/result/f_statistics_from_gene_gender.rds')
-a = readRDS('./hca/geneexpr/result/f_statistics_from_lowExprGene_gender_permute_new1e4.rds')
-a = a[rownames(mat),]
-b = b[rownames(a)]
-pval <- sapply(seq(1,nrow(a)), function(i){
-  sum(a[i,]>b[i])/ncol(a)
-})
-names(pval) = rownames(a)
-fdr = p.adjust(pval,method='fdr')
-
-# ag <- names(sort(pval)[1:16])
-ag <- names(sort(b, decreasing=TRUE)[1:16])
-library(ggplot2)
-library(gridExtra)
-plist <- list()
-for (g in ag){
-  print(g)
-  pd1 = mat[g, grepl('female', colnames(mat))]
-  pd1 = data.frame(Expr=pd1, Cell=names(pd1), Patient = gsub('_.*','',names(pd1) ), Gender='Female')
-  pd2 = mat[g, grepl(':male', colnames(mat))]
-  pd2 = data.frame(Expr=pd2, Cell=names(pd2), Patient = gsub('_.*','',names(pd2) ), Gender='Male')
-  pd = rbind(pd1, pd2)
-  pd = cbind(pd, Pseudotime = order[match(pd$Cell, order$Cell),'Pseudotime'])
-  linedlist <- lapply(unique(pd$Patient), function(p){
-    tmat = mat[g,grepl(p,colnames(mat)),drop=F]
-    trainX = order$Pseudotime[grepl(p,colnames(mat))]
-    pred <- get_spline_fit(tmat, trainX=seq(1,ncol(tmat)), fit.min=min(order$Pseudotime), fit.max=max(order$Pseudotime))
-    tmpdf <- data.frame(Expr=pred[1,], Pseudotime=trainX, Patient=p, Gender=ifelse(grepl('female',p),'female','male'))
-  })
-  ld = do.call(rbind, linedlist)
-  plist[[g]] <- ggplot() + geom_point(data=pd, aes(x=Pseudotime, y=Expr, color=Patient), alpha=.1, size=.2)  + 
-    geom_line(data=ld, aes(x=Pseudotime, y=Expr, color=Patient),alpha=1, size=.5) +
-    theme_classic() + ggtitle(paste0(sub(':.*','',g),',p=', round(pval[g],3),',f=',round(b[g],2))) + theme(legend.position = 'none') + scale_color_manual(values=c(rep('darkblue',4),rep('orange',4)))
-  }
-pdf('./hca/geneexpr/plot.mac/gender_diff_gene_top_f.pdf',width=12,height=9)
-# pdf('./hca/geneexpr/plot.mac/gender_diff_gene_top_pval.pdf',width=12,height=9)
-grid.arrange(grobs=plist,nrow=4)
-dev.off()
-
-
-############# plot order permutation result
-u1 = readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/chrX_genename.rds')
-u2 = readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/chrY_genename.rds')
-allg = sub(':.*','',names(sort(pval)))
-str(allg)
-
-v1 <- sapply(seq(1,length(allg)), function(i){
-  sum(allg[seq(1,i)] %in% u1)
-})
-  
-v2 <- sapply(seq(1,length(allg)), function(i){
-  sum(allg[seq(1,i)] %in% u2)
-})
-v1_pm <- sapply(seq(1,1e2), function(myseed){
-  set.seed(myseed)
-  w1 = sample(allg, length(u1))
-  v1 <- sapply(seq(1,length(allg)), function(i){
-    sum(allg[seq(1,i)] %in% w1)
-  })
-})
-rownames(v1_pm) <- paste0('top',seq(1,nrow(v1_pm)))
-saveRDS(v1_pm, './hca/geneexpr/result/geneset_same_length_as_chrX_gene_pm_mean_order.rds')
-v1_pm <- rowMeans(v1_pm)
-v2_pm <- sapply(seq(1, 1e2), function(myseed){
-  print(myseed)
-  set.seed(myseed)
-  w2 = sample(allg, length(u2))  
-  v2 <- sapply(seq(1,length(allg)), function(i){
-    sum(allg[seq(1,i)] %in% w2)
-  })
-})  
-rownames(v2_pm) <- paste0('top',seq(1,nrow(v2_pm)))
-saveRDS(v2_pm, './hca/geneexpr/result/geneset_same_length_as_chrY_gene_pm_mean_order.rds')
-v2_pm <- rowMeans(v2_pm)
-df = data.frame(chrX=v1, chrY=v2, chrX_pm = v1_pm, chrY_pm = v2_pm, order = seq(1,length(v1)))
-saveRDS(df, './hca/geneexpr/result/df_chrX_chrY_pm_order.rds')
-mat <- NULL
-for (i in 1:4) {
-  mat <- rbind(mat,data.frame(v=df[,i],order=df[,5],type=colnames(df)[i]))
-}
-library(ggplot2)
-pdf('./hca/geneexpr/plot.mac/chrX_chrY_order_compare_to_permutation.pdf', width=4, height=4)
-ggplot(mat,aes(x=order,y=v,col=type, fill=type), alpha=.2) + geom_line() + xlim(c(0,30)) + ylim(c(0,10))+theme_classic()+ylab('number of ChrX/Y genes') + xlab('top n genes (ordered by increasing pvalue)')
-dev.off()
-
-## all chrX + chrY
-u = unique(c(u1,u2))
-v <- sapply(seq(1,length(allg)), function(i){
-  sum(allg[seq(1,i)] %in% u)
-})
-v_pm <- sapply(seq(1, 1e2), function(myseed){
-  print(myseed)
-  set.seed(myseed)
-  w = sample(allg, length(u))  
-  v <- sapply(seq(1,length(allg)), function(i){
-    sum(allg[seq(1,i)] %in% w)
-  })
-})  
-rownames(v_pm) <- paste0('top',seq(1,nrow(v_pm)))
-saveRDS(v_pm, './hca/geneexpr/result/geneset_same_length_as_chrXY_gene_pm_mean_order.rds')
-v_pm = rowMeans(v_pm)
-
-df = data.frame(chrXY = v, chrXY_pm = v_pm)
-saveRDS(df, './hca/geneexpr/result/df_chrXY_pm_order.rds')
-mat <- NULL
-for (i in 1:2) {
-  mat <- rbind(mat,data.frame(v=df[,i],order=seq(1,nrow(df)),type=colnames(df)[i]))
-}
-
-pdf('./hca/geneexpr/plot.mac/chrXY_order_compare_to_permutation.pdf', width=4, height=4)
-ggplot(mat,aes(x=order,y=v,col=type, fill=type), alpha=.2) + geom_line() + xlim(c(0,30)) + ylim(c(0,10))+theme_classic()+ylab('number of ChrX/Y genes') + xlab('top n genes (ordered by increasing pvalue)')
-dev.off()
-
-
-
-
diff --git a/hca_bone_marrow_data_analysis/geneexpr/plot/02_plot_age.R b/hca_bone_marrow_data_analysis/geneexpr/plot/02_plot_age.R
deleted file mode 100644
index bdb624f..0000000
--- a/hca_bone_marrow_data_analysis/geneexpr/plot/02_plot_age.R
+++ /dev/null
@@ -1,73 +0,0 @@
-rm(list=ls())
-setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/')
-# setwd('/Users/wenpinhou/Dropbox/trajectory_variability/')
-order = readRDS('./hca/result/ery/order.rds')
-mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds')
-mat = mat[,order$Cell]
-mat <- mat[rowMeans(mat>0.01)>0.1, ]
-source('./function/01_function.R')
-order = data.frame(order, Patient = gsub('_.*','', order$Cell))
-ap = as.character(unique(order$Patient))
-g1 = ap[grepl('female', ap)]
-g2 = ap[grepl(':male', ap)]
-# source('/Users/wenpinhou/Dropbox/resource/function.R')
-source('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/function.R')
-eg <- sapply(ap, function(p){
-  print(p)
-  tmat <- mat[,grepl(p, colnames(mat))]
-  rownames(tmat)[rowMeans(tmat>0.01)>0.1]
-})
-eg <- unique(unlist(eg))
-mat = mat[eg,]
-vg <- sapply(ap, function(p){
-  print(p)
-  tmat <- mat[,grepl(p, colnames(mat))]
-  vg <- findVariableGene(tmat, num.gene = NULL ,plot.statistics=TRUE, plot.dir = paste0('./hca/geneexpr/plot.mac/',p,'/'))
-})
-vg <- unique(unlist(vg))
-mat = mat[vg,]
-b = readRDS('./hca/geneexpr/result/f_statistics_from_gene_age.rds')
-a = readRDS('./hca/geneexpr/result/f_statistics_from_lowExprGene_age_permute_new1e4.rds')
-a = a[rownames(mat),]
-b = b[rownames(a)]
-pval <- sapply(seq(1,nrow(a)), function(i){
-  sum(a[i,]>b[i])/ncol(a)
-})
-names(pval) = rownames(a)
-fdr = p.adjust(pval,method='fdr')
-pdf('./hca/geneexpr/plot.mac/age_diff_f_p_fdr.pdf', width=7,height=4)
-par(mfrow=c(1,2))
-smoothScatter(pval~b, xlab='f statistics', ylab='p-value')
-smoothScatter(fdr~b, xlab='f statistics', ylab='fdr')
-dev.off()
-ag <- names(sort(pval)[1:16])
-# ag <- names(sort(b, decreasing=TRUE)[1:16])
-library(ggplot2)
-library(gridExtra)
-plist <- list()
-for (g in ag){
-  print(g)
-  pd1 = mat[g, grepl('female', colnames(mat))]
-  pd1 = data.frame(Expr=pd1, Cell=names(pd1), Patient = gsub('_.*','',names(pd1) ), Gender='Female')
-  pd2 = mat[g, grepl(':male', colnames(mat))]
-  pd2 = data.frame(Expr=pd2, Cell=names(pd2), Patient = gsub('_.*','',names(pd2) ), Gender='Male')
-  pd = rbind(pd1, pd2)
-  pd = cbind(pd, Pseudotime = order[match(pd$Cell, order$Cell),'Pseudotime'])
-  linedlist <- lapply(unique(pd$Patient), function(p){
-    tmat = mat[g,grepl(p,colnames(mat)),drop=F]
-    trainX = order$Pseudotime[grepl(p,colnames(mat))]
-    pred <- get_spline_fit(tmat, trainX=seq(1,ncol(tmat)), fit.min=min(order$Pseudotime), fit.max=max(order$Pseudotime))
-    tmpdf <- data.frame(Expr=pred[1,], Pseudotime=trainX, Patient=p, Gender=ifelse(grepl('female',p),'female','male'))
-  })
-  ld = do.call(rbind, linedlist)
-  plist[[g]] <- ggplot() + geom_point(data=pd, aes(x=Pseudotime, y=Expr, color=Patient), alpha=.1, size=.2)  + 
-    geom_line(data=ld, aes(x=Pseudotime, y=Expr, color=Patient),alpha=1, size=.5) +
-    theme_classic() + ggtitle(paste0(sub(':.*','',g),',p=', round(pval[g],3),',f=',round(b[g],2))) + theme(legend.position = 'none') + scale_color_manual(values=c(rep('darkblue',4),rep('orange',4)))
-  }
-# pdf('./hca/geneexpr/plot.mac/age_diff_gene_top_f.pdf',width=12,height=9)
-pdf('./hca/geneexpr/plot.mac/age_diff_gene_top_pval.pdf',width=12,height=9)
-grid.arrange(grobs=plist,nrow=4)
-dev.off()
-
-
-
diff --git a/hca_bone_marrow_data_analysis/test_type_position/code/01_test_type_position.R b/hca_bone_marrow_data_analysis/test_type_position/code/01_test_type_position.R
deleted file mode 100644
index cc298af..0000000
--- a/hca_bone_marrow_data_analysis/test_type_position/code/01_test_type_position.R
+++ /dev/null
@@ -1,60 +0,0 @@
-library(parallel)
-source('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/function/01_function.R')
-plotdir <- '/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/test_type_position/plot/testvar/clusterType9_1/'
-rdir <- '/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/test_type_position/result/testvar/clusterType9_1/'
-d <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/count/clusterType9_1.rds')
-rownames(d) <- sub(':.*','',rownames(d))
-m = log2(d + 1)
-pt <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testtime/data/data/null/pseudotime.rds')
-pseudotime = pt[,2]
-names(pseudotime) = pt[,1]
-ap <- sub(':.*', '', colnames(m))
-design = cbind(1, c(1,1,0,0,1,1,0,0))
-rownames(design) = paste0('BM', seq(1,8))
-colnames(design) <- c('intersect', 'condition')
-ca <- data.frame(Cell = colnames(m), Sample = ap)
-dir.create(plotdir, showWarnings = FALSE, recursive = TRUE)
-dir.create(rdir, showWarnings = FALSE, recursive = TRUE)
-##  test and plot
-tmp <- mclapply(c('all', 'start', 'middle', 'end'), function(pos){
-      ## slope only
-      res <- testpt(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=1, type='Variable', test.slope.only = TRUE, test.position = pos)
-      saveRDS(res, paste0(rdir, 'slope_', pos, '.rds'))
-      gene = names(rev(sort(abs(res$meandiff))))
-      pdf(paste0(plotdir, 'slope_', pos, '_meandiff_genes.pdf'), width = 8, height = 8)
-      print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition'))
-      dev.off()
-      
-      gene = names(sort(res$fdr))
-      pdf(paste0(plotdir, 'slope_', pos, '_fdr_genes.pdf'), width = 8, height = 8)
-      print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition'))
-      dev.off()
-      pdf(paste0(plotdir, 'slope_', pos, '_fdr_foldchange.pdf'), width = 4, height = 4)
-      print(plot(res$foldchange ~ res$fdr[names(res$foldchange)], pch = 20, xlab = 'fdr', ylab = 'LL foldchange'))
-      dev.off()
-      pdf(paste0(plotdir, 'slope_', pos, '_fdr_meandiff.pdf'), width = 4, height = 4)
-      print(plot(res$meandiff ~ res$fdr[names(res$meandiff)], pch = 20, xlab = 'fdr', ylab = 'group mean difference'))
-      dev.off()
-      
-        
-      ## all (intersept + slope)
-      res <- testpt(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=1, type='Variable', test.slope.only = FALSE, test.position = pos)
-      saveRDS(res, paste0(rdir, 'all_', pos, '.rds'))
-      gene = names(rev(sort(abs(res$meandiff))))
-      pdf(paste0(plotdir, 'all_', pos, '_meandiff_genes.pdf'), width = 8, height = 8)
-      print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition'))
-      dev.off()
-      
-      gene = names(sort(res$fdr))
-      pdf(paste0(plotdir, 'all_', pos, '_fdr_genes.pdf'), width = 8, height = 8)
-      print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition'))
-      dev.off()
-      pdf(paste0(plotdir, 'all_', pos, '_fdr_foldchange.pdf'), width = 4, height = 4)
-      print(plot(res$foldchange ~ res$fdr[names(res$foldchange)], pch = 20, xlab = 'fdr', ylab = 'LL foldchange'))
-      dev.off()
-      pdf(paste0(plotdir, 'slope_', pos, '_fdr_meandiff.pdf'), width = 4, height = 4)
-      print(plot(res$meandiff ~ res$fdr[names(res$meandiff)], pch = 20, xlab = 'fdr', ylab = 'group mean difference'))
-      dev.off()
-      
-      return(0)
-}, mc.cores = 8)
diff --git a/hca_bone_marrow_data_analysis/test_type_position/code/02_test_type_position_meandiff.R b/hca_bone_marrow_data_analysis/test_type_position/code/02_test_type_position_meandiff.R
deleted file mode 100644
index 9280e56..0000000
--- a/hca_bone_marrow_data_analysis/test_type_position/code/02_test_type_position_meandiff.R
+++ /dev/null
@@ -1,51 +0,0 @@
-library(parallel)
-source('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/function/01_function.R')
-plotdir <- '/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/test_type_position/plot/'
-rdir <- '/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/test_type_position/result/'
-d <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/count/clusterType9_4.rds')
-rownames(d) <- sub(':.*','',rownames(d))
-m = log2(d + 1)
-pt <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testtime/data/data/null/pseudotime.rds')
-pseudotime = pt[,2]
-names(pseudotime) = pt[,1]
-ap <- sub(':.*', '', colnames(m))
-design = cbind(1, c(1,1,0,0,1,1,0,0))
-rownames(design) = paste0('BM', seq(1,8))
-colnames(design) <- c('intersect', 'condition')
-ca <- data.frame(Cell = colnames(m), Sample = ap)
-
-##  test and plot
-tmp <- mclapply(c('all', 'start', 'middle', 'end'), function(pos){
-      res <- testpt(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=1, type='Variable', test.slope.only = TRUE, test.position = pos)
-      saveRDS(res, paste0(rdir, 'slope_', pos, '.rds'))
-      gene = names(rev(sort(abs(res$meandiff))))
-      pdf(paste0(plotdir, 'slope_', pos, '_meandiff_genes.pdf'), width = 8, height = 8)
-      print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition'))
-      dev.off()
-      
-      gene = names(sort(res$fdr))
-      pdf(paste0(plotdir, 'slope_', pos, '_fdr_genes.pdf'), width = 8, height = 8)
-      print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition'))
-      dev.off()
-      pdf(paste0(plotdir, 'slope_', pos, '_fdr_foldchange.pdf'), width = 4, height = 4)
-      print(plot(res$foldchange ~ res$fdr[names(res$foldchange)], pch = 20, xlab = 'fdr', ylab = 'LL foldchange'))
-      dev.off()
-        
-      
-      res <- testpt(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=1, type='Variable', test.slope.only = FALSE, test.position = pos)
-      saveRDS(res, paste0(rdir, 'all_', pos, '.rds'))
-      gene = names(rev(sort(abs(res$meandiff))))
-      pdf(paste0(plotdir, 'all_', pos, '_meandiff_genes.pdf'), width = 8, height = 8)
-      print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition'))
-      dev.off()
-      
-      gene = names(sort(res$fdr))
-      pdf(paste0(plotdir, 'all_', pos, '_fdr_genes.pdf'), width = 8, height = 8)
-      print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition'))
-      dev.off()
-      pdf(paste0(plotdir, 'all_', pos, '_fdr_foldchange.pdf'), width = 4, height = 4)
-      print(plot(res$foldchange ~ res$fdr[names(res$foldchange)], pch = 20, xlab = 'fdr', ylab = 'LL foldchange'))
-      dev.off()
-      return(0)
-}, mc.cores = 4)
-
diff --git a/hca_bone_marrow_data_analysis/testpattern/code/01_test.R b/hca_bone_marrow_data_analysis/testpattern/code/01_test.R
deleted file mode 100644
index 52349ba..0000000
--- a/hca_bone_marrow_data_analysis/testpattern/code/01_test.R
+++ /dev/null
@@ -1,135 +0,0 @@
-# ------------
-# prepare data
-# ------------
-library(parallel)
-library(splines)
-data <- as.character(commandArgs(trailingOnly = TRUE)[[1]][1]) ## clusterType9_1
-print(paste0('Analyzing ',data, '...'))
-source('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/function/01_function.R')
-rdir <- paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/result/', data, '/')
-plotdir <- paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/plot/', data, '/')
-dir.create(rdir, recursive = TRUE)
-dir.create(plotdir, recursive = TRUE)
-d <- readRDS(paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/saver/', data, '.rds'))
-rownames(d) <- sub(':.*','',rownames(d))
-d <- d[!duplicated(rownames(d)), ]
-m = log2(d + 1)
-pt <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testtime/data/data/null/pseudotime.rds')
-pseudotime = pt[,2]
-names(pseudotime) = pt[,1]
-ap <- sub(':.*', '', colnames(m))
-design = cbind(1, c(1,1,0,0,1,1,0,0))
-rownames(design) = paste0('BM', seq(1,8))
-colnames(design) <- c('intersect', 'condition')
-ca <- data.frame(Cell = colnames(m), Sample = ap, stringsAsFactors = FALSE)
- 
-# -----
-# test
-# -----
-system.time({
-  Res <- ptest(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=detectCores(), type='Variable', fit.resolution = 1000)
-  saveRDS(Res, paste0(rdir, 'ptest_res.rds'))
-})
-
-#     user   system  elapsed 
-# 74525.76 11940.81  3409.03 
-
-names(Res)
-str(Res$res)
-
-# check <<<<<<<<<<<<<
-selgene <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/selgene/selgene.rds')
-selgene <- intersect(sub(':.*', '', selgene), rownames(res))
-apply(res[selgene, ], 2, summary)
-apply(res[!rownames(res) %in% selgene, ], 2, summary)
-
-# >>>>>>>>>>>>>>>>
-
-# ----------------------
-# plot significant genes
-# ----------------------
-res <- Res$res
-write.csv(Res$res, file = paste0(rdir, 'ptest_res.csv'), quote = FALSE)
-# mean diff
-res1 <- res[res$meandiff.fdr < 0.05, ]
-
-
-if (nrow(res1) > 0){
-  write.csv(res1, file = paste0(rdir, 'ptest_interceptdiff.csv'), quote = FALSE)
-  pdf(paste0(plotdir, 'interceptdiff_fdr_gene.pdf'), width = 10, height = 7)
-  gene = rownames(res1)[order(res1[,'meandiff.fdr'])]
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res1))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-  
-  pdf(paste0(plotdir, 'interceptdiff_diff_gene.pdf'), width = 10, height = 7)
-  gene = rownames(res1)[order(abs(res1[,'meandiff.diff']), decreasing = TRUE)]
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res1))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-  
-  pdf(paste0(plotdir, 'interceptdiff_lfc_gene.pdf'), width = 10, height = 7)
-  gene = rownames(res1)[order(abs(res1[,'meandiff.lfc']), decreasing = TRUE)]
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res1))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-} else {
-  print('No interceptdiff fdr < 0.05!')
-}
-  
-# trend diff
-res2 <- res[res$trenddiff.fdr < 0.05, ]
-if (nrow(res2) > 0){
-  write.csv(res2, file = paste0(rdir, 'ptest_trenddiff.csv'), quote = FALSE)
-  pdf(paste0(plotdir, 'trenddiff_fdr_gene.pdf'), width = 10, height = 7)
-  gene = rownames(res2)[order(res2[,'trenddiff.fdr'])]
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res2))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-  
-  pdf(paste0(plotdir, 'trenddiff_diff_gene.pdf'), width = 10, height = 7)
-  gene = rownames(res2)[order(abs(res2[,'trenddiff.diff']), decreasing = TRUE)]
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res2))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-  
-  pdf(paste0(plotdir, 'trenddiff_lfc_gene.pdf'), width = 10, height = 7)
-  gene = rownames(res2)[order(abs(res2[,'trenddiff.lfc']), decreasing = TRUE)]
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res2))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-} else {
-  print('No trenddiff fdr < 0.05!')
-}
- 
-## intercept diff but no trend diff
-res3 <- res[res$meandiff.fdr < 0.05 & res$trenddiff.fdr > 0.05, ]
-if (nrow(res3) > 0){
-  write.csv(res3, file = paste0(rdir, 'ptest_interceptdiff_butNoTrenddiff.csv'), quote = FALSE)
-  pdf(paste0(plotdir, 'interceptdiff_butNoTrenddiff_fdr_gene.pdf'), width = 10, height = 7)
-  gene = rownames(res3)[order(res3[,'trenddiff.fdr'])]
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res3))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-}
-  
-# ------------------------
-# plot insignificant genes
-# ------------------------
-# intercept diff
-res4 <- res[res$meandiff.fdr > 0.05, , drop = FALSE]
-if (nrow(res4) > 0){
-  write.csv(res4, file = paste0(rdir, 'ptest_interceptdiff_insig.csv'), quote = FALSE)
-  pdf(paste0(plotdir, 'interceptdiff_fdr_gene_insig.pdf'), width = 10, height = 7)
-  gene = rev(rownames(res4)[order(res4[,'meandiff.fdr'])])
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res4))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-} else {
-  print('No interceptdiff fdr > 0.05!')
-}
-  
-# trend diff
-res5 <- res[res$trenddiff.fdr > 0.05, , drop = FALSE]
-if (nrow(res5) > 0){
-  write.csv(res5, file = paste0(rdir, 'ptest_trenddiff_insig.csv'), quote = FALSE)
-  pdf(paste0(plotdir, 'trenddiff_fdr_gene_insig.pdf'), width = 10, height = 7)
-  gene = rownames(res5)[order(res5[,'trenddiff.fdr'])]
-  print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res5))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1))
-  dev.off()
-} else {
-  print('No trenddiff fdr > 0.05!')
-}
-
diff --git a/hca_bone_marrow_data_analysis/testpattern/code/02_population_pattern.R b/hca_bone_marrow_data_analysis/testpattern/code/02_population_pattern.R
deleted file mode 100644
index 3690412..0000000
--- a/hca_bone_marrow_data_analysis/testpattern/code/02_population_pattern.R
+++ /dev/null
@@ -1,98 +0,0 @@
-# new a function
-# input: testpt output including beta, phi
-# input: covariables values user wants to know , if NULL then the unique values of the covatiates in the testpt data. if in the data only have age = 10, 20, 30, users can input 25 then we can output the pseudotime pattern of age == 25.
-# phi * x * beta
-
-data <- as.character(commandArgs(trailingOnly = T)[[1]][1])
-# data = 'clusterType10_1'
-source('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/myfunc/01_function.R')
-source('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/function/01_function.R')
-plotdir <- paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/plot/', data, '/')
-rdir <- paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/result/', data, '/')
-
-Res <- readRDS(paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/result/', data, '/ptest_res.rds'))
-res <- Res$res
-head(res)
-res <- res[res[,1] < 0.05 | res[,4] < 0.05, ]
-gene <- rownames(res[res[,4] < 0.05, ])
-expr = Res$expression[gene, ]
-knotnum = Res$knotnum[gene]
-design = Res$design
-cellanno = Res$cellanno
-rownames(cellanno) = cellanno[,1]
-pseudotime = Res$pseudotime
-
-beta <- lapply(Res$trenddiff.parameter[gene], function(i){
-  i$beta
-})
-names(beta) <- gene
-predict.values <- Res$predict.values[gene,]
-
-pseudotime = pseudotime[order(pseudotime)]
-expr = expr[, names(pseudotime)]
-predict.values = predict.values[, names(pseudotime)]
- 
-# plot(fit1~pseudotime, col = 'red', pch = 20, cex = .5,ylim=c(0,3))
-# points(fit2~pseudotime, pch = 20, cex = .5)
-library(splines)
-fit <- sapply(gene, function(g){
-  print(g)
-  fit <- get_population_fit(Res, 'condition', g = g)
-  vn <- sapply(1:length(fit), function(i){
-    paste0(names(fit)[i], ';', rownames(fit[[i]]))
-  })
-  v <- as.vector(do.call(cbind, fit))
-  names(v) <- vn
-  v
-})
-
-clu <- mykmeans(fit,10)$cluster
-agg <- aggregate(t(fit),list(clu),mean)
-agg <- agg[,-1]
-agg <- as.matrix(agg)
-rownames(agg) <- paste0('cluster', seq(1, nrow(agg)))
-
-saveRDS(fit, paste0(rdir, 'trenddiff_gene_popoulation_fit_10clu.rds'))
-saveRDS(clu, paste0(rdir, 'trenddiff_gene_cluster_10clu.rds'))
-saveRDS(agg, paste0(rdir, 'trenddiff_gene_agg_10clu.rds'))
-
-library(reshape2)
-pd <- melt(agg)
-pd$covariate <- as.factor(sub(';.*', '', pd[,2]))
-pd$x <- as.numeric(pseudotime[sub('.*;', '', pd[,2])])
-pd$cell <- sub('.*;', '', pd$Var2)
-library(ggplot2)
-
-pdf(paste0(plotdir, 'population_level_trenddiff_gene_10clu.pdf'), width = 4, height = max(clu)*2)
-ggplot() +
-  geom_point(data = pd, aes(x = x, y = value, group = covariate,color = pd$covariate)) +
-  theme_classic() +
-  scale_color_brewer(palette = 'Dark2') +
-  xlab('Pseudotime') + ylab('Expression') +
-  labs(color = NULL) +
-  facet_wrap(~Var1, ncol = 1) 
-dev.off()
-
-library(pheatmap)
-library(RColorBrewer)
-hmpd <- predict.values[names(sort(clu)), names(pseudotime)]
-hmpd <- hmpd[, order(as.character(pd[match(colnames(hmpd), pd$cell),'covariate']), pseudotime[colnames(hmpd)])]
-anno_row <- data.frame(cluster = as.factor(clu[rownames(hmpd)]))
-rownames(anno_row) = rownames(hmpd)
-anno_col <- data.frame(covariate = pd[match(colnames(hmpd), pd$cell),'covariate'])
-rownames(anno_col) <- colnames(hmpd)
-
-
-png(paste0(plotdir, 'population_level_trenddiff_gene_hm_10clu.png'), width = 600, height = 600)
-pheatmap(hmpd, cluster_cols = FALSE, cluster_rows = FALSE, 
-         show_rownames = FALSE, show_colnames = FALSE,
-         annotation_row = anno_row,
-         annotaton_col = anno_col)
-dev.off()
-
-
-selgene <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/selgene/selgene.rds')
-selgene = sub(':.*','',selgene)
-df <- data.frame(interceptdiff = ifelse(res[,1]<0.05, 'interceptdiff', 'nointerceptdiff'), trenddiff = ifelse(res[,4]<0.05, 'trenddiff','notrenddiff'), selgene = ifelse(rownames(res) %in% selgene, 'true', 'false'))
-rownames(df) = rownames(res)
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_14_clu_permute1e2/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_14_clu_permute1e2/code/01_reproducibility.R
deleted file mode 100644
index a74ad5e..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_14_clu_permute1e2/code/01_reproducibility.R
+++ /dev/null
@@ -1,254 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-n.permute <- 100
-suppressMessages(library(igraph))
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-
-### determine numPC
-## k-means on the rows of `matrix`. If number.cluster is NA, k is picked from
-## 1..20 by a two-segment linear fit to betweenss/totss; otherwise k is fixed.
-## Returns the kmeans() fit. Resets the RNG seed to 12345 on every call.
-mykmeans <- function(matrix, number.cluster = NA){
-  set.seed(12345)
-  max.iter <- 1000  # same iteration cap for the scan and for the returned fit
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    explained <- unlist(parallel::mclapply(seq_len(maxclunum), function(clunum) {
-      fit <- kmeans(matrix, clunum, iter.max = max.iter)
-      fit$betweenss/fit$totss
-    }, mc.cores = 20))
-    x <- seq_len(maxclunum)
-    ## knot position whose piecewise-linear fit leaves the smallest residual SS
-    optclunum <- which.min(sapply(x, function(i) {
-      x2 <- pmax(0, x - i)
-      sum(lm(explained ~ x + x2)$residuals^2)
-    }))
-    number.cluster <- optclunum
-  }
-  kmeans(matrix, number.cluster, iter.max = max.iter)
-}
-
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:50
-optpoint <- which.min(sapply(2:20, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 7
-# pr <- pca[,1:2]
-
-## clustering
-clu <- mykmeans(pr, number.cluster = 14)$cluster
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-
-## RColorBrewer / scattermore are only attached further down, so qualify the calls
-mypalette <- colorRampPalette(RColorBrewer::brewer.pal(9, 'Set1'))
-ggplot(data = pd, aes(x = x, y = y, color = clu)) +
-  scattermore::geom_scattermore() +
-  scale_color_manual(values = mypalette(14))
-## cell type composition in clusters
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-ggplot(data = pd) +
-  geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-  theme_classic() +
-  ylab('Celltype Proportion') +
-  scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-# tmp <- which.min(sapply(1:clun,function(scn) mean(ctlevel[match(ct[match(names(clu)[clu==scn],ct[,1]),3],ctlevel[,1]),2],na.rm=T)))
-# 
-
-###
-mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-# mcl <- exprmclust(t(pr), reduce = F)
-plotmclust(mcl, cell_point_size = 0.1)
-str(mcl)
-## find origin
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-## within-branch rank of every cell, for however many branches TSCANorder returned
-pt <- data.frame(cell = unname(unlist(ord)), time = unlist(lapply(ord, seq_along), use.names = FALSE), stringsAsFactors = FALSE)
-## plot pseudotime
-pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[match(rownames(pca), pt[,1]),2]))
-library(scattermore)
-library(RColorBrewer)
-
-
-ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  geom_scattermore() +
-  scale_color_gradient(low = 'yellow', high = 'blue')
-
-# -----------
-# permutation 
-# -----------
-jslist <- oclist <- list()
-for (pmid in seq(1, n.permute)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  ## cluster cells
-  clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-  mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[match(names(mcl.pm[['clusterid']]), pt[,1]),2], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  
-  pt.pm <- data.frame(cell = unname(unlist(ord.pm)), time = unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))), stringsAsFactors = FALSE)
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[match(rownames(pca), pt.pm[,1]),2]))
-  library(scattermore)
-  library(RColorBrewer)
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore()
-
-  ## compare two MST
-  js <- sapply(seq(1, length(ord)), function(i){
-          sapply(seq(1, length(ord.pm)), function(j){
-            b.ori <- ord[[i]]
-            b.pm <- ord.pm[[j]]
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-      })
-  
-  oc <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              b.ori <- ord[[i]]
-              b.pm <- ord.pm[[j]]
-              oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-}
-saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')   
-saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-
-
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-str(jsm)
-str(ocm)
-par(mfrow = c(1,2))
-hist(jsm)
-hist(ocm)
-
-js.cut <- 0.5
-oc.cut <- 0.6
-
-res <- sapply(seq(1,length(jslist)), function(i){
-  js <- jslist[[i]]
-  js.binary <- (js > js.cut) + 0
-
-  ## Reduce js.binary to a one-to-one matching. Rows are branches of the
-  ## resampled tree, columns are branches of the original tree. While any row
-  ## or column still holds more than one hit, keep only the hit with the
-  ## largest Jaccard value in that row / column.
-  while (any(rowSums(js.binary) > 1) || any(colSums(js.binary) > 1)){
-    ## rows with several hits: keep the best-scoring column
-    ## (looping over an empty dup.id does nothing, so no length check is needed)
-    dup.id <- which(rowSums(js.binary) > 1)
-    for (dup.i in dup.id){
-      addid <- which.max(js[dup.i, ])
-      js.binary[dup.i, ] <- 0
-      js.binary[dup.i, addid] <- 1
-    }
-
-    ## columns with several hits: keep the best-scoring row. The duplicated
-    ## column itself is cleared (not the row that happens to share its index),
-    ## and every duplicated column is resolved on its own through dup.i rather
-    ## than through the whole dup.id vector. This is the same procedure the
-    ## overlap-coefficient loop in this script applies to oc.binary.
-    ## (looping over an empty dup.id does nothing, so no length check is needed)
-    dup.id <- which(colSums(js.binary) > 1)
-    for (dup.i in dup.id){
-      addid <- which.max(js[, dup.i])
-      js.binary[, dup.i] <- 0
-      js.binary[addid, dup.i] <- 1
-    }
-  }
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  as.character(js.melt[,2])
-})
-res <- unlist(res)  
-js.perc <- table(res)/n.permute
-saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-res <- sapply(seq(1,length(oclist)), function(i){
-  oc <- oclist[[i]]
-  oc.binary <- (oc > oc.cut) + 0
-  while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){
-    dup.id <- which(rowSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[dup.id, ])
-      oc.binary[dup.id, ] <- 0
-      oc.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[dup.i, ])
-        oc.binary[dup.i, ] <- 0
-        oc.binary[dup.i, addid] <- 1  
-      }
-    }
-    
-    dup.id <- which(colSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[, dup.id])
-      oc.binary[, dup.id] <- 0
-      oc.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[, dup.i])
-        oc.binary[, dup.i] <- 0
-        oc.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-    
-  oc.melt <- melt(oc.binary)
-  oc.melt <- oc.melt[oc.melt[,3]!=0,]
-  if (length(oc.melt[,2]) > 6) print(i)
-  as.character(oc.melt[,2])
-})
-res <- unlist(res)  
-oc.perc <- table(res)/n.permute
-saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3/code/01_reproducibility.R
deleted file mode 100644
index ecb0151..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3/code/01_reproducibility.R
+++ /dev/null
@@ -1,236 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-## k-means on the rows of `matrix`. If number.cluster is NA, k is picked from
-## 1..20 by a two-segment linear fit to betweenss/totss; otherwise k is fixed.
-## Returns the kmeans() fit. Resets the RNG seed to 12345 on every call.
-mykmeans <- function(matrix, number.cluster = NA){
-  set.seed(12345)
-  max.iter <- 1000  # same iteration cap for the scan and for the returned fit
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    explained <- unlist(parallel::mclapply(seq_len(maxclunum), function(clunum) {
-      fit <- kmeans(matrix, clunum, iter.max = max.iter)
-      fit$betweenss/fit$totss
-    }, mc.cores = 20))
-    x <- seq_len(maxclunum)
-    ## knot position whose piecewise-linear fit leaves the smallest residual SS
-    optclunum <- which.min(sapply(x, function(i) {
-      x2 <- pmax(0, x - i)
-      sum(lm(explained ~ x + x2)$residuals^2)
-    }))
-    number.cluster <- optclunum
-  }
-  kmeans(matrix, number.cluster, iter.max = max.iter)
-}
-
-# set.seed(12345)
-# library(umap)
-# u <- umap(pca[,1:10])$layout
-
-# ggplot(data.frame(u1=u[,1],u2=u[,2],ct=ct[match(rownames(u),ct[,1]),2]),aes(x=u1,y=u2,col=ct)) + geom_point() + facet_wrap(~ct)
-
-### determine numPC
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:20
-optpoint <- which.min(sapply(2:20, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev[1:20] ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 2
-# pr <- pca[,1:2]
-
-# ## clustering
-# clu <- mykmeans(pr, number.cluster = 14)$cluster
-# pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-# 
-# mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-# ggplot(data = pd, aes(x = x, y = y, color = clu)) +
-#   geom_scattermore()+
-#   scale_color_manual(values = mypalette(14))
-# 
-# ## cell type composition in clusters
-# pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-# tab <- table(pd[,3:4])
-# tab <- tab/rowSums(tab)
-# pd <- melt(tab)
-# pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-# 
-# ggplot(data = pd) +
-#   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-#   theme_classic() +
-#   ylab('Celltype Proportion') +
-#   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-# tmp <- which.min(sapply(1:clun,function(scn) mean(ctlevel[match(ct[match(names(clu)[clu==scn],ct[,1]),3],ctlevel[,1]),2],na.rm=T)))
-#
-
-### mclust
-# mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-mcl <- exprmclust(t(pr), reduce = F)
-plotmclust(mcl, cell_point_size = 0.1)
-str(mcl)
-## find origin
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-pt <- data.frame(cell = unname(unlist(ord)), time = unlist(sapply(sapply(ord, length), function(i) seq(1, i))), stringsAsFactors = FALSE)
-
-## plot pseudotime
-pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[match(rownames(pca), pt[,1]),2]))
-library(scattermore)
-library(RColorBrewer)
-ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  geom_scattermore() +
-  scale_color_gradient(low = 'yellow', high = 'blue')
-
-# -----------
-# permutation 
-# -----------
-jslist <- oclist <- list()
-for (pmid in seq(1, 1e3)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  # ## cluster cells
-  # clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-  # mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-  mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[match(names(mcl.pm[['clusterid']]), pt[,1]),2], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  
-  pt.pm <- data.frame(cell = unname(unlist(ord.pm)), time = unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))), stringsAsFactors = FALSE)
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[match(rownames(pca), pt.pm[,1]),2]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore()
-
-  ## compare two MST
-  js <- sapply(seq(1, length(ord)), function(i){
-          sapply(seq(1, length(ord.pm)), function(j){
-            b.ori <- ord[[i]]
-            b.pm <- ord.pm[[j]]
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-      })
-  
-  oc <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              b.ori <- ord[[i]]
-              b.pm <- ord.pm[[j]]
-              oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-}
-saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')   
-saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-
-
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-str(jsm)
-str(ocm)
-par(mfrow = c(1,2))
-hist(jsm)
-hist(ocm)
-
-js.cut <- 0.5
-oc.cut <- 0.6
-
-res <- sapply(seq(1,length(jslist)), function(i){
-  js <- jslist[[i]]
-  js.binary <- (js > js.cut) + 0
-  dup.id <- which(rowSums(js.binary) > 1)
-  if (length(dup.id) == 1){
-    addid <- which.max(js[dup.id, ])
-    js.binary[dup.id, ] <- 0
-    js.binary[dup.id, addid] <- 1  
-  } else if (length(dup.id) > 1) {
-    for (dup.i in dup.id){
-      addid <- which.max(js[dup.i, ])
-      js.binary[dup.i, ] <- 0
-      js.binary[dup.i, addid] <- 1  
-    }
-  }
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  as.character(js.melt[,2])
-})
-res <- unlist(res)  
-js.perc <- table(res)/1e3
-saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-res <- sapply(seq(1,length(oclist)), function(i){
-  oc <- oclist[[i]]
-  oc.binary <- (oc > oc.cut) + 0
-  dup.id <- which(rowSums(oc.binary) > 1)
-  if (length(dup.id) == 1){
-    addid <- which.max(oc[dup.id, ])
-    oc.binary[dup.id, ] <- 0
-    oc.binary[dup.id, addid] <- 1  
-  } else if (length(dup.id) > 1) {
-    for (dup.i in dup.id){
-      addid <- which.max(oc[dup.i, ])
-      oc.binary[dup.i, ] <- 0
-      oc.binary[dup.i, addid] <- 1  
-    }
-  }
-  
-  oc.melt <- melt(oc.binary)
-  oc.melt <- oc.melt[oc.melt[,3]!=0,]
-  as.character(oc.melt[,2])
-})
-res <- unlist(res)  
-oc.perc <- table(res)/1e3
-
-sort((js.perc + oc.perc)/2)
-saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-
-
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff/code/01_reproducibility.R
deleted file mode 100644
index f411805..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff/code/01_reproducibility.R
+++ /dev/null
@@ -1,225 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-## k-means on the rows of `matrix`. If number.cluster is NA, k is picked from
-## 1..20 by a two-segment linear fit to betweenss/totss; otherwise k is fixed.
-## Returns the kmeans() fit. Resets the RNG seed to 12345 on every call.
-mykmeans <- function(matrix, number.cluster = NA){
-  set.seed(12345)
-  max.iter <- 1000  # same iteration cap for the scan and for the returned fit
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    explained <- unlist(parallel::mclapply(seq_len(maxclunum), function(clunum) {
-      fit <- kmeans(matrix, clunum, iter.max = max.iter)
-      fit$betweenss/fit$totss
-    }, mc.cores = 20))
-    x <- seq_len(maxclunum)
-    ## knot position whose piecewise-linear fit leaves the smallest residual SS
-    optclunum <- which.min(sapply(x, function(i) {
-      x2 <- pmax(0, x - i)
-      sum(lm(explained ~ x + x2)$residuals^2)
-    }))
-    number.cluster <- optclunum
-  }
-  kmeans(matrix, number.cluster, iter.max = max.iter)
-}
-
-### determine numPC
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:20
-optpoint <- which.min(sapply(2:20, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev[1:20] ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 2
-
-### mclust
-# mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-mcl <- exprmclust(t(pr), reduce = F)
-plotmclust(mcl, cell_point_size = 0.1)
-str(mcl)
-## find origin
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-pt <- data.frame(cell = unname(unlist(ord)), time = unlist(sapply(sapply(ord, length), function(i) seq(1, i))), stringsAsFactors = FALSE)
-
-## plot pseudotime
-pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[match(rownames(pca), pt[,1]),2]))
-library(scattermore)
-library(RColorBrewer)
-ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  geom_scattermore() +
-  scale_color_gradient(low = 'yellow', high = 'blue')
-
-# -------------------------------------------------------
-# null distribution of Jaccard index, overlap coefficient
-# -------------------------------------------------------
-js.null <- lapply(seq(1, length(ord)), function(i){
-  b.ori <- ord[[i]]
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-  })
-})
-
-par(mfrow = c(1,3))
-hist(js.null[[1]])
-hist(js.null[[2]])
-hist(js.null[[3]])
-
-js.cut <- sapply(js.null, quantile, 0.99)
-
-oc.null <- lapply(seq(1, length(ord)), function(i){
-  b.ori <- ord[[i]]
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-  })
-})
-par(mfrow = c(1,3))
-hist(oc.null[[1]])
-hist(oc.null[[2]])
-hist(oc.null[[3]])
-
-oc.cut <- sapply(oc.null, quantile, 0.99)
-
-# -----------
-# permutation 
-# -----------
-jslist <- oclist <- list()
-for (pmid in seq(1, 1e3)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  # ## cluster cells
-  mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[match(names(mcl.pm[['clusterid']]), pt[,1]),2], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  
-  pt.pm <- data.frame(cell = unname(unlist(ord.pm)), time = unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))), stringsAsFactors = FALSE)
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[match(rownames(pca), pt.pm[,1]),2]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore()
-
-  ## compare two MST
-  js <- sapply(seq(1, length(ord)), function(i){
-          sapply(seq(1, length(ord.pm)), function(j){
-            b.ori <- ord[[i]]
-            b.pm <- ord.pm[[j]]
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-      })
-  
-  oc <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              b.ori <- ord[[i]]
-              b.pm <- ord.pm[[j]]
-              oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-}
-saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')   
-saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-par(mfrow = c(1,2))
-hist(jsm)
-hist(ocm)
-
-res <- sapply(seq(1,length(jslist)), function(i){
-  js <- jslist[[i]]
-  js.binary <- sapply(seq(1,ncol(js)), function(c){
-    (js[,c] > js.cut[c]) + 0
-  })
-  dup.id <- which(rowSums(js.binary) > 1)
-  if (length(dup.id) == 1){
-    addid <- which.max(js[dup.id, ])
-    js.binary[dup.id, ] <- 0
-    js.binary[dup.id, addid] <- 1  
-  } else if (length(dup.id) > 1) {
-    for (dup.i in dup.id){
-      addid <- which.max(js[dup.i, ])
-      js.binary[dup.i, ] <- 0
-      js.binary[dup.i, addid] <- 1  
-    }
-  }
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  as.character(js.melt[,2])
-})
-res <- unlist(res)  
-js.perc <- table(res)/1e3
-saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-res <- sapply(seq(1,length(oclist)), function(i){
-  oc <- oclist[[i]]
-  oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-    (oc[,c] > oc.cut[c]) + 0
-  })
-  dup.id <- which(rowSums(oc.binary) > 1)
-  if (length(dup.id) == 1){
-    addid <- which.max(oc[dup.id, ])
-    oc.binary[dup.id, ] <- 0
-    oc.binary[dup.id, addid] <- 1  
-  } else if (length(dup.id) > 1) {
-    for (dup.i in dup.id){
-      addid <- which.max(oc[dup.i, ])
-      oc.binary[dup.i, ] <- 0
-      oc.binary[dup.i, addid] <- 1  
-    }
-  }
-  oc.melt <- melt(oc.binary)
-  oc.melt <- oc.melt[oc.melt[,3]!=0,]
-  as.character(oc.melt[,2])
-})
-res <- unlist(res)  
-oc.perc <- table(res)/1e3
-sort((js.perc + oc.perc)/2)
-saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff_js_oc_corr/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff_js_oc_corr/code/01_reproducibility.R
deleted file mode 100644
index 6a0319e..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff_js_oc_corr/code/01_reproducibility.R
+++ /dev/null
@@ -1,274 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 1e3
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-## k-means on the rows of `matrix`. If number.cluster is NA, k is picked from
-## 1..20 by a two-segment linear fit to betweenss/totss; otherwise k is fixed.
-## Returns the kmeans() fit. Resets the RNG seed to 12345 on every call.
-mykmeans <- function(matrix, number.cluster = NA){
-  set.seed(12345)
-  max.iter <- 1000  # same iteration cap for the scan and for the returned fit
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    explained <- unlist(parallel::mclapply(seq_len(maxclunum), function(clunum) {
-      fit <- kmeans(matrix, clunum, iter.max = max.iter)
-      fit$betweenss/fit$totss
-    }, mc.cores = 20))
-    x <- seq_len(maxclunum)
-    ## knot position whose piecewise-linear fit leaves the smallest residual SS
-    optclunum <- which.min(sapply(x, function(i) {
-      x2 <- pmax(0, x - i)
-      sum(lm(explained ~ x + x2)$residuals^2)
-    }))
-    number.cluster <- optclunum
-  }
-  kmeans(matrix, number.cluster, iter.max = max.iter)
-}
-
-### determine numPC
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:20
-optpoint <- which.min(sapply(2:20, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev[1:20] ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 2
-
-### mclust
-# mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-mcl <- exprmclust(t(pr), reduce = F)
-plotmclust(mcl, cell_point_size = 0.1)
-str(mcl)
-## find origin
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-
-## plot pseudotime
-pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-library(scattermore)
-library(RColorBrewer)
-ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  geom_scattermore() +
-  scale_color_gradient(low = 'yellow', high = 'blue')
-
-# -------------------------------------------------------
-# null distribution of Jaccard index, overlap coefficient
-# -------------------------------------------------------
-js.null <- lapply(seq(1, length(ord)), function(i){
-  b.ori <- ord[[i]]
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-  })
-})
-
-par(mfrow = c(1,3))
-hist(js.null[[1]])
-hist(js.null[[2]])
-hist(js.null[[3]])
-
-js.cut <- sapply(js.null, quantile, 0.99)
-
-oc.null <- lapply(seq(1, length(ord)), function(i){
-  b.ori <- ord[[i]]
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-  })
-})
-par(mfrow = c(1,3))
-hist(oc.null[[1]])
-hist(oc.null[[2]])
-hist(oc.null[[3]])
-
-oc.cut <- sapply(oc.null, quantile, 0.99)
-
-# -----------
-# permutation 
-# -----------
-corrlist <- jslist <- oclist <- list()
-for (pmid in seq(1, n.permute)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  # ## cluster cells
-  mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore()
-
-  ## compare two MST
-  js <- sapply(seq(1, length(ord)), function(i){
-          sapply(seq(1, length(ord.pm)), function(j){
-            b.ori <- ord[[i]]
-            b.pm <- ord.pm[[j]]
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-      })
-  oc <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              b.ori <- ord[[i]]
-              b.pm <- ord.pm[[j]]
-              oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  corr <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              ov = intersect(ord[[i]], ord.pm[[j]])
-              cor(pt[ov], pt.pm[ov])
-           }) 
-        })
-  corr[is.na(corr)] <- 0
-  colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-  corrlist[[pmid]] <- corr
-}
-saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')
-saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')
-## correlations get their own file; writing them to pm_oc.rds would replace the oclist saved above
-saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_corr.rds')
-
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-par(mfrow = c(1,2))
-hist(jsm)
-hist(ocm)
-
-res <- corr.score <- list()
-for (i in seq(1, length(jslist))){
-  js <- jslist[[i]]
-  js.binary <- sapply(seq(1,ncol(js)), function(c){
-    (js[,c] > js.cut[c]) + 0
-  })
-  ## Reduce js.binary to a one-to-one matching. Rows are branches of the
-  ## resampled tree, columns are branches of the original tree. While any row
-  ## or column still holds more than one hit, keep only the hit with the
-  ## largest Jaccard value in that row / column.
-  while (any(rowSums(js.binary) > 1) || any(colSums(js.binary) > 1)){
-    ## rows with several hits: keep the best-scoring column
-    ## (looping over an empty dup.id does nothing, so no length check is needed)
-    dup.id <- which(rowSums(js.binary) > 1)
-    for (dup.i in dup.id){
-      addid <- which.max(js[dup.i, ])
-      js.binary[dup.i, ] <- 0
-      js.binary[dup.i, addid] <- 1
-    }
-
-    ## columns with several hits: keep the best-scoring row. The duplicated
-    ## column itself is cleared (not the row that happens to share its index),
-    ## and every duplicated column is resolved on its own through dup.i rather
-    ## than through the whole dup.id vector. This is the same procedure the
-    ## overlap-coefficient loop in this script applies to oc.binary.
-    ## (looping over an empty dup.id does nothing, so no length check is needed)
-    dup.id <- which(colSums(js.binary) > 1)
-    for (dup.i in dup.id){
-      addid <- which.max(js[, dup.i])
-      js.binary[, dup.i] <- 0
-      js.binary[addid, dup.i] <- 1
-    }
-  }
-  
-  
-  corr.score[[i]] <- corrlist[[i]] * js.binary
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  res[[i]] <- as.character(js.melt[,2])
-}
-res <- unlist(res)  
-js.perc <- table(res)/n.permute
-saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-corr.score.m <- do.call(rbind, corr.score)
-corr.score.v <- colSums(corr.score.m)/n.permute
-saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds')
-
-res <- sapply(seq(1,length(oclist)), function(i){
-  oc <- oclist[[i]]
-  oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-    (oc[,c] > oc.cut[c]) + 0
-  })
-  while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){
-    dup.id <- which(rowSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[dup.id, ])
-      oc.binary[dup.id, ] <- 0
-      oc.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[dup.i, ])
-        oc.binary[dup.i, ] <- 0
-        oc.binary[dup.i, addid] <- 1  
-      }
-    }
-    dup.id <- which(colSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[, dup.id])
-      oc.binary[, dup.id] <- 0
-      oc.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[, dup.i])
-        oc.binary[, dup.i] <- 0
-        oc.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  oc.melt <- melt(oc.binary)
-  oc.melt <- oc.melt[oc.melt[,3]!=0,]
-  as.character(oc.melt[,2])
-})
-res <- unlist(res)  
-oc.perc <- table(res)/n.permute
-sort((js.perc + oc.perc)/2)
-saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/01_reproducibility.R
deleted file mode 100644
index 6a0319e..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/01_reproducibility.R
+++ /dev/null
@@ -1,274 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 1e3
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-mykmeans <- function(matrix, number.cluster = NA){
-  ## cluster the rows
-  set.seed(12345)
-  library(parallel)
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=20)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-
-### determine numPC
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:20
-optpoint <- which.min(sapply(2:20, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev[1:20] ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 2
-
-### mclust
-# mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-mcl <- exprmclust(t(pr), reduce = F)
-plotmclust(mcl, cell_point_size = 0.1)
-str(mcl)
-## find origin
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-
-## plot pseudotime
-pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-library(scattermore)
-library(RColorBrewer)
-ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  geom_scattermore() +
-  scale_color_gradient(low = 'yellow', high = 'blue')
-
-# -------------------------------------------------------
-# null distribution of Jaccard index, overlap coefficient
-# -------------------------------------------------------
-js.null <- lapply(seq(1, length(ord)), function(i){
-  b.ori <- ord[[i]]
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-  })
-})
-
-par(mfrow = c(1,3))
-hist(js.null[[1]])
-hist(js.null[[2]])
-hist(js.null[[3]])
-
-js.cut <- sapply(js.null, quantile, 0.99)
-
-oc.null <- lapply(seq(1, length(ord)), function(i){
-  b.ori <- ord[[i]]
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-  })
-})
-par(mfrow = c(1,3))
-hist(oc.null[[1]])
-hist(oc.null[[2]])
-hist(oc.null[[3]])
-
-oc.cut <- sapply(oc.null, quantile, 0.99)
-
-# -----------
-# permutation 
-# -----------
-corrlist <- jslist <- oclist <- list()
-for (pmid in seq(1, n.permute)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  # ## cluster cells
-  mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore()
-
-  ## compare two MST
-  js <- sapply(seq(1, length(ord)), function(i){
-          sapply(seq(1, length(ord.pm)), function(j){
-            b.ori <- ord[[i]]
-            b.pm <- ord.pm[[j]]
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-      })
-  oc <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              b.ori <- ord[[i]]
-              b.pm <- ord.pm[[j]]
-              oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  corr <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              ov = intersect(ord[[i]], ord.pm[[j]])
-              cor(pt[ov], pt.pm[ov])
-           }) 
-        })
-  corr[is.na(corr)] <- 0
-  colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-  corrlist[[pmid]] <- corr
-}
-saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')   
-saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-par(mfrow = c(1,2))
-hist(jsm)
-hist(ocm)
-
-res <- corr.score <- list()
-for (i in seq(1, length(jslist))){
-  js <- jslist[[i]]
-  js.binary <- sapply(seq(1,ncol(js)), function(c){
-    (js[,c] > js.cut[c]) + 0
-  })
-  while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){
-    dup.id <- which(rowSums(js.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(js[dup.id, ])
-      js.binary[dup.id, ] <- 0
-      js.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(js[dup.i, ])
-        js.binary[dup.i, ] <- 0
-        js.binary[dup.i, addid] <- 1  
-      }
-    }
-      
-    dup.id <- which(colSums(js.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(js[, dup.id])
-      js.binary[dup.id, ] <- 0
-      js.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(js[, dup.id])
-        js.binary[, dup.id] <- 0
-        js.binary[addid, dup.id] <- 1  
-      }
-    }
-  }
-  
-  
-  corr.score[[i]] <- corrlist[[i]] * js.binary
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  res[[i]] <- as.character(js.melt[,2])
-}
-res <- unlist(res)  
-js.perc <- table(res)/n.permute
-saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-corr.score.m <- do.call(rbind, corr.score)
-corr.score.v <- colSums(corr.score.m)/n.permute
-saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds')
-
-res <- sapply(seq(1,length(oclist)), function(i){
-  oc <- oclist[[i]]
-  oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-    (oc[,c] > oc.cut[c]) + 0
-  })
-  while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){
-    dup.id <- which(rowSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[dup.id, ])
-      oc.binary[dup.id, ] <- 0
-      oc.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[dup.i, ])
-        oc.binary[dup.i, ] <- 0
-        oc.binary[dup.i, addid] <- 1  
-      }
-    }
-    dup.id <- which(colSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[, dup.id])
-      oc.binary[, dup.id] <- 0
-      oc.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[, dup.i])
-        oc.binary[, dup.i] <- 0
-        oc.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  oc.melt <- melt(oc.binary)
-  oc.melt <- oc.melt[oc.melt[,3]!=0,]
-  as.character(oc.melt[,2])
-})
-res <- unlist(res)  
-oc.perc <- table(res)/n.permute
-sort((js.perc + oc.perc)/2)
-saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/02_samples_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/02_samples_reproducibility.R
deleted file mode 100644
index 82056b3..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/02_samples_reproducibility.R
+++ /dev/null
@@ -1,452 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 1e3
-max.clunum <- 50
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-mykmeans <- function(matrix, number.cluster = NA){
-  ## cluster the rows
-  set.seed(12345)
-  library(parallel)
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=20)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-
-### determine numPC
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:max.clunum
-optpoint <- which.min(sapply(2:max.clunum, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 7
-
-## clustering
-clu <- mykmeans(pr, number.cluster = 14)$cluster
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-
-# mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-# ggplot(data = pd, aes(x = x, y = y, color = clu)) + 
-#   geom_scattermore()+
-#   scale_color_manual(values = mypalette(14))+
-#   theme_classic() + xlab('UMAP1') + ylab('UMAP2')
-
-## cell type composition in clusters
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-# ggplot(data = pd) +
-#   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-#   theme_classic() +
-#   ylab('Celltype Proportion') +
-#   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-### mclust
-mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-# mcl <- exprmclust(t(pr), reduce = F)
-# plotmclust(mcl, cell_point_size = 0.1)
-# str(mcl)
-
-# --------------------
-# construct pseudotime 
-# --------------------
-## find origin
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-# ## plot pseudotime
-# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-# library(scattermore)
-# library(RColorBrewer)
-# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-#   geom_scattermore() +
-#   scale_color_gradient(low = 'yellow', high = 'blue')
-
-# ------------------------------------------------------------
-# get candidate branches to test reproducibility, 20200726 >>
-# ------------------------------------------------------------
-findbranch <- function(mst, order, origin){
-  deg <- degree(mst)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  eg <- expand.grid(1:length(vertex), 1:length(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg = data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  library(igraph)
-  tmpbranch <- lapply(seq(1,nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[sapply(tmpbranch, length) >0]  
- 
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]])
-  allbranch <- paste0(names(allbranch), collapse = ' ')
-  newbranch <-sapply(tmpbranch, function(i) {
-      tmp <- paste0(i, collapse = ',')
-      if (!grepl(tmp, allbranch)){
-        rev(i)
-      } else {
-        i
-      }
-  })
-  return(newbranch)
-}
-newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-
-
-# -------------------------------------------------------
-# null distribution of Jaccard index, overlap coefficient
-# -------------------------------------------------------
-## add here --------------->>>>>>
-## for samples
-## add here ---------------<<<<<<<
-js.null <- lapply(seq(1, length(newbranch)), function(i){
-  b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  b.ori.alls <- gsub(':.*', '', b.ori)
-  alls <- gsub(':.*', '', rownames(pr))
-  tmp <- mclapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    b.pm.alls <- gsub(':.*', '', b.pm)
-    tmpp <- sapply(unique(alls), function(s){
-      b.pm.s <- b.pm[b.pm.alls == s]
-      b.ori.s <- b.ori[b.ori.alls == s]
-      length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s))
-    })  
-  },mc.cores = detectCores()-2)
-  tmp <- do.call(rbind,tmp)
-})
-js.cut <- sapply(js.null, function(i) apply(i, 2, quantile, 0.99))
-# ------------------
-
-oc.null <- lapply(seq(1, length(newbranch)), function(i){
-  b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  b.ori.alls <- gsub(':.*', '', b.ori)
-  tmp <- mclapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    b.pm.alls <- gsub(':.*', '', b.pm)
-    tmpp <- sapply(unique(alls), function(s){
-      b.pm.s <- b.pm[b.pm.alls == s]
-      b.ori.s <- b.ori[b.ori.alls == s]
-      length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s))
-    })  
-  },mc.cores = detectCores()-2)
-  tmp <- do.call(rbind,tmp)
-})
-oc.cut <- sapply(oc.null, function(i) apply(i, 2, quantile, 0.99))
-
-# -----------
-# permutation 
-# -----------
-corrlist.alls <- jslist.alls <- oclist.alls <- list()
-n.permute = 100
-for (pmid in seq(1, n.permute)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  # ## cluster cells
-  clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-
-  pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-  pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-  pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-  pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-  pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-
-  # ggplot() + 
-  #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-  #   scale_color_manual(values = mypalette(14))+
-  #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-  #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-  
-
-  ## cell type composition in clusters
-  # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  # tab <- table(pd[,3:4])
-  # tab <- tab/rowSums(tab)
-  # pd <- melt(tab)
-  # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  # ggplot(data = pd) +
-  #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-  #   theme_classic() +
-  #   ylab('Celltype Proportion') +
-  #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-  # build pseudotime
-  mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore() + theme_classic()
-  
-  # get candidate branches
-  newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-  
-  ## compare two MST
-  js <- sapply(seq(1, length(newbranch)), function(i){
-          print('i')
-          print(i)
-          id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-          cells <- ord[[id]]
-          b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-          b.ori.alls <- gsub(':.*', '', b.ori)
-          tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-            print(j)
-            id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-            cells <- ord.pm[[id]]
-            b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-            b.pm.alls <- gsub(':.*', '', b.pm)
-            # js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-            tmpp <- sapply(unique(alls), function(s){
-              b.pm.s <- b.pm[b.pm.alls == s]
-              b.ori.s <- b.ori[b.ori.alls == s]
-              length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s))
-            })  
-          },mc.cores = detectCores()-2)
-          tmp <- do.call(rbind,tmp)
-          rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-          tmp
-        }, simplify = FALSE)
-  names(js) <- paste0('branch', seq(1, length(newbranch)))
-  ###### =====================================
-  oc <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-            b.ori.alls <- gsub(':.*', '', b.ori)
-            tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-                id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                cells <- ord.pm[[id]]
-                b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                b.pm.alls <- gsub(':.*', '', b.pm)
-                # oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-                tmpp <- sapply(unique(alls), function(s){
-                  b.pm.s <- b.pm[b.pm.alls == s]
-                  b.ori.s <- b.ori[b.ori.alls == s]
-                  length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s))
-                })  
-          },mc.cores = detectCores()-2)
-          tmp <- do.call(rbind,tmp)
-          rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-          tmp
-        }, simplify = FALSE)
-  names(oc) <- paste0('branch', seq(1, length(newbranch)))           
-        
-  
-  corr <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-              b.ori.alls <- gsub(':.*', '', b.ori)
-              tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  b.pm.alls <- gsub(':.*', '', b.pm)
-                  # ov = intersect(b.ori, b.pm)
-                  # cor(pt[ov], pt.pm[ov])
-                  tmpp <- sapply(unique(alls), function(s){
-                    b.pm.s <- b.pm[b.pm.alls == s]
-                    b.ori.s <- b.ori[b.ori.alls == s] 
-                    ov = intersect(b.ori.s, b.pm.s)
-                    cor(pt[ov], pt.pm[ov])
-                  })
-              }, mc.cores = detectCores()-2) 
-              tmp <- do.call(rbind, tmp)
-              rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-              tmp[is.na(tmp)] <- 0
-              tmp
-          }, simplify = FALSE)
-  # corr[is.na(corr)] <- 0
-  names(corr) <- paste0('branch', seq(1, length(newbranch)))           
-  # colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-  jslist.alls[[pmid]] <- js
-  oclist.alls[[pmid]] <- oc
-  corrlist.alls[[pmid]] <- corr
-}
-saveRDS(jslist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_js_alls.rds')   
-saveRDS(oclist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_oc_alls.rds')   
-saveRDS(corrlist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/samples/result/pm_oc_alls.rds')   
-
-# jsm <- do.call(rbind, jslist)
-# ocm <- do.call(rbind, oclist)
-# par(mfrow = c(1,2))
-# hist(jsm)
-# hist(ocm)
-s = unique(alls)[1]
-df.alls <- lapply(unique(alls), function(s){
-  jslist = sapply(jslist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  oclist = sapply(oclist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  corrlist = sapply(corrlist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  
-  res <- corr.score <- list()
-  for (i in seq(1, length(jslist))){
-    print(i)
-    js <- jslist[[i]]
-    js.binary <- sapply(seq(1,ncol(js)), function(c){
-      (js[,c] > js.cut[c]) + 0
-    })
-    while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){
-      dup.id <- which(rowSums(js.binary) > 1)
-      if (length(dup.id) == 1){
-        addid <- which.max(js[dup.id, ])
-        js.binary[dup.id, ] <- 0
-        js.binary[dup.id, addid] <- 1  
-      } else if (length(dup.id) > 1) {
-        for (dup.i in dup.id){
-          print(dup.i)
-          addid <- which.max(js[dup.i, ])
-          js.binary[dup.i, ] <- 0
-          js.binary[dup.i, addid] <- 1  
-        }
-      }
-        
-      dup.id <- which(colSums(js.binary) > 1)
-      if (length(dup.id) == 1){
-        addid <- which.max(js[, dup.id])
-        js.binary[, dup.id] <- 0
-        js.binary[addid, dup.id] <- 1  
-      } else if (length(dup.id) > 1) {
-        for (dup.i in dup.id){
-          addid <- which.max(js[, dup.i])
-          js.binary[, dup.i] <- 0
-          js.binary[addid, dup.i] <- 1  
-        }
-      }
-    }
-    
-    corr.score[[i]] <- corrlist[[i]] * js.binary
-    js.melt <- melt(js.binary)
-    js.melt <- js.melt[js.melt[,3]!=0,]
-    res[[i]] <- as.character(js.melt[,2])
-  }
-  res <- unlist(res)  
-  js.perc <- table(res)/n.permute
-  names(js.perc) <- newbranch
-  # saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/js_percentage.rds')
-  
-  corr.score.m <- do.call(rbind, corr.score)
-  corr.score.v <- colSums(corr.score.m)/n.permute
-  names(corr.score.v) <- newbranch
-  # saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/corr_score.rds')
-  
-  res <- sapply(seq(1,length(oclist)), function(i){
-    print(i)
-    oc <- oclist[[i]]
-    oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-      (oc[,c] > oc.cut[c]) + 0
-    })
-    while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){
-      dup.id <- which(rowSums(oc.binary) > 1)
-      if (length(dup.id) == 1){
-        addid <- which.max(oc[dup.id, ])
-        oc.binary[dup.id, ] <- 0
-        oc.binary[dup.id, addid] <- 1  
-      } else if (length(dup.id) > 1) {
-        for (dup.i in dup.id){
-          addid <- which.max(oc[dup.i, ])
-          oc.binary[dup.i, ] <- 0
-          oc.binary[dup.i, addid] <- 1  
-        }
-      }
-      dup.id <- which(colSums(oc.binary) > 1)
-      if (length(dup.id) == 1){
-        addid <- which.max(oc[, dup.id])
-        oc.binary[, dup.id] <- 0
-        oc.binary[addid, dup.id] <- 1  
-      } else if (length(dup.id) > 1) {
-        for (dup.i in dup.id){
-          addid <- which.max(oc[, dup.i])
-          oc.binary[, dup.i] <- 0
-          oc.binary[addid, dup.i] <- 1  
-        }
-      }
-    }
-    oc.melt <- melt(oc.binary)
-    oc.melt <- oc.melt[oc.melt[,3]!=0,]
-    as.character(oc.melt[,2])
-  })
-  res <- unlist(res)  
-  oc.perc <- table(res)/n.permute
-  names(oc.perc) <- newbranch
-  sort((js.perc + oc.perc)/2)
-  
-  df <- data.frame(js.perc = js.perc, oc.perc = oc.perc, corr.score.v = corr.score.v)
-  df <- df[, c(2,4,5)]
-
-# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/oc_percentage.rds')
-  
-})
-names(df.alls) <- unique(alls)
-df.alls[order(names(df.alls))]
-
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/03_try_to_build_module.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/03_try_to_build_module.R
deleted file mode 100644
index c822909..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/03_try_to_build_module.R
+++ /dev/null
@@ -1,443 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 3
-max.clunum <- 50
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-#  umap, pca
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-alls <- sub(':.*', '', names(a))
-names(alls) <- names(a)
-  
-mykmeans <- function(matrix, number.cluster = NA){
-  ## cluster the rows
-  set.seed(12345)
-  library(parallel)
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=20)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-
-### determine numPC
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:max.clunum
-optpoint <- which.min(sapply(2:max.clunum, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 7
-
-## clustering
-clu <- mykmeans(pr, number.cluster = 14)$cluster
-# pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-# mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-# ggplot(data = pd, aes(x = x, y = y, color = clu)) + 
-#   geom_scattermore()+
-#   scale_color_manual(values = mypalette(14))+
-#   theme_classic() + xlab('UMAP1') + ylab('UMAP2')
-
-# ## cell type composition in clusters
-# pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-# tab <- table(pd[,3:4])
-# tab <- tab/rowSums(tab)
-# pd <- melt(tab)
-# pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-# 
-# ggplot(data = pd) +
-#   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-#   theme_classic() +
-#   ylab('Celltype Proportion') +
-#   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-### mclust
-mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-# mcl <- exprmclust(t(pr), reduce = F)
-# plotmclust(mcl, cell_point_size = 0.1)
-# str(mcl)
-
-# --------------------
-# construct pseudotime 
-# --------------------
-## find origin
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-# ## plot pseudotime
-# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-# library(scattermore)
-# library(RColorBrewer)
-# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-#   geom_scattermore() +
-#   scale_color_gradient(low = 'yellow', high = 'blue')
-
-# ------------------------------------------------------------
-# get candidate branches to test reproducibility, 20200726 >>
-# ------------------------------------------------------------
-findbranch <- function(mst, order, origin){
-  deg <- degree(mst)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  eg <- expand.grid(1:length(vertex), 1:length(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg = data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  library(igraph)
-  tmpbranch <- lapply(seq(1,nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[sapply(tmpbranch, length) >0]  
- 
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]])
-  allbranch <- paste0(names(allbranch), collapse = ' ')
-  newbranch <-sapply(tmpbranch, function(i) {
-      tmp <- paste0(i, collapse = ',')
-      if (!grepl(tmp, allbranch)){
-        rev(i)
-      } else {
-        i
-      }
-  })
-  return(newbranch)
-}
-newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-
-# -----------------------------------------------------
-# Evaluate robustness of tree branches using resampling
-# -----------------------------------------------------
-
-# null distribution of Jaccard index, overlap coefficient
-
-js.null <- lapply(seq(1, length(newbranch)), function(i) {
-  b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-  })
-})
-
-# par(mfrow = c(2,ceiling(length(js.null)/2)))
-# for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50)
-
-js.cut <- sapply(js.null, quantile, 0.99)
-
-oc.null <- lapply(seq(1, length(newbranch)), function(i){
-  b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-  })
-})
-# par(mfrow = c(2,ceiling(length(oc.null)/2)))
-# for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50)
-oc.cut <- sapply(oc.null, quantile, 0.99)
-
-# permutation 
-
-get_binary <- function(js){
-  js.binary <- sapply(seq(1,ncol(js)), function(c){
-    (js[,c] > js.cut[c]) + 0
-  })
-  while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){
-    dup.id <- which(rowSums(js.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(js[dup.id, ])
-      js.binary[dup.id, ] <- 0
-      js.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(js[dup.i, ])
-        js.binary[dup.i, ] <- 0
-        js.binary[dup.i, addid] <- 1  
-      }
-    }
-      
-    dup.id <- which(colSums(js.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(js[, dup.id])
-      js.binary[, dup.id] <- 0
-      js.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(js[, dup.i])
-        js.binary[, dup.i] <- 0
-        js.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  return(js.binary)
-}
-
-ctcomplist <- reproduce <- corr.score <- corrlist <- jslist <- oclist <- list()
-
-for (pmid in seq(1, n.permute)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  ## cluster cells
-  clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-
-  # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-  pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-  pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-  pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-  pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-  pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  # ggplot() + 
-  #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-  #   scale_color_manual(values = mypalette(14))+
-  #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-  #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-  
-
-  ## cell type composition in clusters
-  # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  # tab <- table(pd[,3:4])
-  # tab <- tab/rowSums(tab)
-  # pd <- melt(tab)
-  # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  # ggplot(data = pd) +
-  #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-  #   theme_classic() +
-  #   ylab('Celltype Proportion') +
-  #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-  ## build pseudotime
-  mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-  ## plot pseudotime
-  
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore() + theme_classic()
-  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  
-  # get candidate branches
-  newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-  
-  ## compare two MST
-  js <- sapply(seq(1, length(newbranch)), function(i){
-          id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-          cells <- ord[[id]]
-          b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-          sapply(seq(1, length(newbranch.pm)), function(j){
-            print(j)
-            id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-            cells <- ord.pm[[id]]
-            b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-        })
-  oc <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-            sapply(seq(1, length(newbranch.pm)), function(j){
-                id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                cells <- ord.pm[[id]]
-                b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  corr <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-              
-              sapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  ov = intersect(b.ori, b.pm)
-                  cor(pt[ov], pt.pm[ov])
-              }) 
-          })
-  corr[is.na(corr)] <- 0
-  colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-  corrlist[[pmid]] <- corr
-  
-  ## get js binary to matched branches <<<<<<<<<<<<<<< 
-  js.binary <- get_binary(js)
-  corr.score[[pmid]] <- corr * js.binary
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-  reproduce[[pmid]] <- as.character(js.melt[,2])
-  ## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  ## 
-  tmp <- js.melt[1,2]
-  ctcomp <- sapply(js.melt[,2], function(tmp){
-    c <- names(clu)[clu %in% newbranch.pm[[tmp]]]
-    ctcomp <- rep(0, length(unique(alls)))
-    names(ctcomp) <- unique(alls)
-    ctcomp[names(table(alls[c]))] <- table(alls[c])
-  })
-  colnames(ctcomp) <- paste0('origin', js.melt[,2])
-  ctcomp <- ctcomp/rowSums(ctcomp)
-  
-  
-  ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch))
-  colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch)))
-  rownames(ctcomp.new) <- unique(alls)
-  ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp
-  ctcomplist[[pmid]] <- t(ctcomp.new)
-  
-}
-
-# saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')   
-# saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-# 
-# saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-# par(mfrow = c(1,2))
-# hist(jsm)
-# hist(ocm)
-
-## moved within boostrap
-# reproduce <- corr.score <- list()
-# for (i in seq(1, length(jslist))){
-#   print(i)
-#   js <- jslist[[i]]
-#   js.binary <- get_binary(js)
-#   corr.score[[i]] <- corrlist[[i]] * js.binary
-#   js.melt <- melt(js.binary)
-#   js.melt <- js.melt[js.melt[,3]!=0,]
-#   colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-#   reproduce[[i]] <- as.character(js.melt[,2])
-# }
-
-reproduce <- unlist(reproduce)  
-
-
-js.perc <- rep(0, length(newbranch))
-js.perc[as.numeric(names(table(reproduce)))] <-  table(reproduce)/n.permute
-names(js.perc) <- newbranch
-# saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-corr.score.m <- do.call(rbind, corr.score)
-corr.score.v <- colSums(corr.score.m)/n.permute
-names(corr.score.v) <- newbranch
-# saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds')
-
-res <- sapply(seq(1,length(oclist)), function(i){
-  print(i)
-  oc <- oclist[[i]]
-  oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-    (oc[,c] > oc.cut[c]) + 0
-  })
-  while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){
-    dup.id <- which(rowSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[dup.id, ])
-      oc.binary[dup.id, ] <- 0
-      oc.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[dup.i, ])
-        oc.binary[dup.i, ] <- 0
-        oc.binary[dup.i, addid] <- 1  
-      }
-    }
-    dup.id <- which(colSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[, dup.id])
-      oc.binary[, dup.id] <- 0
-      oc.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[, dup.i])
-        oc.binary[, dup.i] <- 0
-        oc.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  oc.melt <- melt(oc.binary)
-  oc.melt <- oc.melt[oc.melt[,3]!=0,]
-  as.character(oc.melt[,2])
-})
-res <- unlist(res)  
-oc.perc <- rep(0, length(newbranch))
-oc.perc[as.numeric(names(table(res)))] <-  table(res)/n.permute
-names(oc.perc) <- newbranch
-sort((js.perc + oc.perc)/2)
-
-# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE)
-sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean)
-sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd)
-rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))]
-rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))]
-
-result <- list(detection.rate = detection.rate, 
-               sample.cellcomp.mean = sample.cellcomp.mean, 
-               sample.cellcomp.sd = sample.cellcomp.sd)
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/04_try_to_build_module_v2.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/04_try_to_build_module_v2.R
deleted file mode 100644
index e6d73c3..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/04_try_to_build_module_v2.R
+++ /dev/null
@@ -1,408 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 3
-max.clunum <- 50
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-plotdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/plot/'
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-#  umap, pca
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-  
-mykmeans <- function(matrix, number.cluster = NA, maxclunum = 20, seed = 12345){
-  ## cluster the rows
-  set.seed(seed)
-  library(parallel)
-  if (is.na(number.cluster)){
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      set.seed(12345)
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=20)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    print(optclunum)
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-findbranch <- function(mst, order, origin){
-  deg <- degree(mst)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  eg <- expand.grid(1:length(vertex), 1:length(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg = data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  library(igraph)
-  tmpbranch <- lapply(seq(1,nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[sapply(tmpbranch, length) >0]  
- 
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]])
-  allbranch <- paste0(names(allbranch), collapse = ' ')
-  newbranch <-sapply(tmpbranch, function(i) {
-      tmp <- paste0(i, collapse = ',')
-      if (!grepl(tmp, allbranch)){
-        rev(i)
-      } else {
-        i
-      }
-  })
-  return(newbranch)
-}
-get_binary <- function(matrix, matrix.cut){
-  ## match boostrap and origin branches.
-  ## matrix: #boostrap.branch * #origin.branch, values are js or oc
-  ## matrix.cut: js or oc null distribution cutoff 
-  matrix.binary <- sapply(seq(1,ncol(matrix)), function(c){
-    (matrix[,c] > matrix.cut[c]) + 0
-  })
-  while (length(which(rowSums(matrix.binary) > 1)) > 0 | length(which(colSums(matrix.binary) > 1)) > 0){
-    dup.id <- which(rowSums(matrix.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(matrix[dup.id, ])
-      matrix.binary[dup.id, ] <- 0
-      matrix.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(matrix[dup.i, ])
-        matrix.binary[dup.i, ] <- 0
-        matrix.binary[dup.i, addid] <- 1  
-      }
-    }
-      
-    dup.id <- which(colSums(matrix.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(matrix[, dup.id])
-      matrix.binary[, dup.id] <- 0
-      matrix.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(matrix[, dup.i])
-        matrix.binary[, dup.i] <- 0
-        matrix.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  return(matrix.binary)
-}
-infer_tree_structure <- function(pca, ct, origin.celltype, number.cluster = NA, plotdir = getwd()){
-  alls <- sub(':.*', '', ct$cell)
-  names(alls) <- ct$cell
-  set.seed(12345)
-  sdev <- apply(pca, 2, sd)
-  x <- 1:max.clunum
-  optpoint <- which.min(sapply(2:max.clunum, function(i) {
-    x2 <- pmax(0, x - i)
-    sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2)
-  }))
-  pcadim = optpoint + 1
-  pr <- pca[,1:pcadim]  # 7
-  
-  ## clustering
-  # clu <- mykmeans(pr, number.cluster = number.cluster, maxclunum = 50, seed = i)$cluster
-  clu <- mykmeans(pr, maxclunum = 50, number.cluster = number.cluster)$cluster
-  table(clu)
-  pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-  mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-  pdf(paste0(plotdir, 'cluster.pdf'), width = 5, height = 4)
-  print(ggplot(data = pd, aes(x = x, y = y, color = clu)) +
-    geom_scattermore()+
-    scale_color_manual(values = mypalette(14))+
-    theme_classic() + xlab('PC1') + ylab('PC2'))
-  dev.off()
-  ## cell type composition in clusters
-  pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  tab <- table(pd[,3:4])
-  tab <- tab/rowSums(tab)
-  pd <- melt(tab)
-  pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  pdf(paste0(plotdir, 'celltype_composition_for_cluster.pdf'), width = 9, height = 5)
-  print(ggplot(data = pd) +
-    geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-    theme_classic() +
-    ylab('Celltype Proportion') +
-    scale_fill_manual(values = mypalette(length(unique(pd$celltype)))))
-  dev.off()
-  ### mclust
-  mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-  # mcl <- exprmclust(t(pr), reduce = F)
-  pdf(paste0(plotdir, 'mcl.pdf'), width=8,height=8)
-  print(plotmclust(mcl, cell_point_size = 0.1))
-  dev.off()
-
-  # str(mcl)
-  # 
-  # --------------------
-  # construct pseudotime 
-  # --------------------
-  ## find origin
-  pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-  pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  tab <- table(pd[,3:4])
-  tab <- tab/rowSums(tab)
-  pd <- melt(tab)
-  pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  tmp <- pd[pd$celltype == origin.celltype, ]
-  origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-  
-  ## construct pseudotime
-  ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-  str(ord)
-  length(ord)
-  pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-  names(pt) <- unname(unlist(ord))
-  
-  # ## plot pseudotime
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-  library(scattermore)
-  library(RColorBrewer)
-  pdf(paste0(plotdir, 'pseudotime.pdf'), width = 7, height = 6)  
-  print(ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-    geom_scattermore() +
-    scale_color_gradient(low = 'yellow', high = 'blue'))
-  dev.off()
-  # ------------------------------------------------------------
-  # get candidate branches to test reproducibility, 20200726 >>
-  # ------------------------------------------------------------
-  
-  newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-  
-  # -----------------------------------------------------
-  # Evaluate robustness of tree branches using resampling
-  # -----------------------------------------------------
-  
-  # null distribution of Jaccard index, overlap coefficient
-  
-  js.null <- lapply(seq(1, length(newbranch)), function(i) {
-    b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-    tmp <- sapply(seq(1, 1e3), function(j){
-      set.seed(j)
-      b.pm <- sample(rownames(pr), length(b.ori))
-      length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-    })
-  })
-  
-  # par(mfrow = c(2,ceiling(length(js.null)/2)))
-  # for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50)
-  
-  js.cut <- sapply(js.null, quantile, 0.99)
-  
-  oc.null <- lapply(seq(1, length(newbranch)), function(i){
-    b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-    tmp <- sapply(seq(1, 1e3), function(j){
-      set.seed(j)
-      b.pm <- sample(rownames(pr), length(b.ori))
-      length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-    })
-  })
-  # par(mfrow = c(2,ceiling(length(oc.null)/2)))
-  # for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50)
-  oc.cut <- sapply(oc.null, quantile, 0.99)
-  
-  mcl$pseudotime <- pt
-  mcl$branch <- newbranch
-  mcl$js.cut <- js.cut
-  mcl$oc.cut <- oc.cut
-  mcl$pca <- pr
-  mcl$order <- ord
-  mcl$allsample <- alls
-  return(mcl)
-}
-evaluate_uncertainty <- function(inferobj, n.permute){
-  pr <- inferobj$pca
-  newbranch <- inferobj$branch
-  js.cut <- inferobj$js.cut
-  oc.cut <- inferobj$oc.cut 
-  pt <- inferobj$pseudotime
-  ord <- inferobj$order
-  alls <- inferobj$allsample
-  ctcomplist <- reproduce.js <- reproduce.oc <- corr.score <- list()
-  for (pmid in seq(1, n.permute)){
-    print(pmid)
-    ## boostrap cells
-    set.seed(pmid)
-    pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-    pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-    
-    ## cluster cells
-    clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-  
-    # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-    # pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-    # pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-    # pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-    # pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-    # pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    # ggplot() + 
-    #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-    #   scale_color_manual(values = mypalette(14))+
-    #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-    #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-    
-  
-    ## cell type composition in clusters
-    # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-    # tab <- table(pd[,3:4])
-    # tab <- tab/rowSums(tab)
-    # pd <- melt(tab)
-    # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-    # ggplot(data = pd) +
-    #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-    #   theme_classic() +
-    #   ylab('Celltype Proportion') +
-    #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-  
-    ## build pseudotime
-    mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-    # plotmclust(mcl.pm, cell_point_size = 0.1)
-    
-    ## select origin cluster
-    pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-    start.cluster <- names(which.min(pt.pm.mean))
-    
-    ## construct pseudotime
-    ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-    # str(ord.pm)
-    
-    pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-    names(pt.pm) <- unname(unlist(ord.pm))
-    # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-    ## plot pseudotime
-    
-    pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-    # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-    #   geom_scattermore() + theme_classic()
-    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    
-    # get candidate branches
-    newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-    
-    ## compare two MST
-    js <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-            sapply(seq(1, length(newbranch.pm)), function(j){
-              
-              id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-              cells <- ord.pm[[id]]
-              b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-              js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-            })
-          })
-    oc <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-              sapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-             }) 
-          })
-    corr <- sapply(seq(1, length(newbranch)), function(i){
-                id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-                cells <- ord[[id]]
-                b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-                
-                sapply(seq(1, length(newbranch.pm)), function(j){
-                    id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                    cells <- ord.pm[[id]]
-                    b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                    ov = intersect(b.ori, b.pm)
-                    cor(pt[ov], pt.pm[ov])
-                }) 
-            })
-    corr[is.na(corr)] <- 0
-    colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-    
-    ## get js binary to match branches 
-    js.binary <- get_binary(js, js.cut)
-    corr.score[[pmid]] <- corr * js.binary
-    js.melt <- melt(js.binary)
-    js.melt <- js.melt[js.melt[,3]!=0,]
-    colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-    reproduce.js[[pmid]] <- as.character(js.melt[,2])
-    
-    ## get oc binary to match branches
-    oc.binary <- get_binary(oc, oc.cut)
-    oc.melt <- melt(oc.binary)
-    oc.melt <- oc.melt[oc.melt[,3]!=0,]
-    reproduce.oc[[pmid]] <- as.character(oc.melt[,2])
-  
-    ## samples cell compositions 
-    ctcomp <- sapply(js.melt[,2], function(tmp){
-      c <- names(clu)[clu %in% newbranch.pm[[tmp]]]
-      ctcomp <- rep(0, length(unique(alls)))
-      names(ctcomp) <- unique(alls)
-      ctcomp[names(table(alls[c]))] <- table(alls[c])
-    })
-    colnames(ctcomp) <- paste0('origin', js.melt[,2])
-    ctcomp <- ctcomp/rowSums(ctcomp)
-    
-    ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch))
-    colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch)))
-    rownames(ctcomp.new) <- unique(alls)
-    ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp
-    ctcomplist[[pmid]] <- t(ctcomp.new)
-  }
-  
-  reproduce.js <- unlist(reproduce.js)  
-  js.perc <- rep(0, length(newbranch))
-  js.perc[as.numeric(names(table(reproduce.js)))] <-  table(reproduce.js)/n.permute
-  names(js.perc) <- newbranch
-  
-  reproduce.oc <- unlist(reproduce.oc)  
-  oc.perc <- rep(0, length(newbranch))
-  oc.perc[as.numeric(names(table(reproduce.oc)))] <-  table(reproduce.oc)/n.permute
-  names(oc.perc) <- newbranch
-  
-  corr.score.m <- do.call(rbind, corr.score)
-  corr.score.v <- colSums(corr.score.m)/n.permute
-  names(corr.score.v) <- newbranch
-  
-  sort((js.perc + oc.perc)/2)
-  
-  detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE)
-  sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean)
-  sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd)
-  rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))]
-  rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))]
-  
-  result <- list(detection.rate = detection.rate, 
-                 sample.cellcomp.mean = sample.cellcomp.mean, 
-                 sample.cellcomp.sd = sample.cellcomp.sd)
-  return(result)
-}
-
-# permutation 
-a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir)
-result <- evaluate_uncertainty(a, 100)
-saveRDS(result, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/result.rds')
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/05_try_to_build_module_v5.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/05_try_to_build_module_v5.R
deleted file mode 100644
index f0c34b4..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/05_try_to_build_module_v5.R
+++ /dev/null
@@ -1,38 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 3
-max.clunum <- 50
-source("/Users/wenpinhou/Dropbox/trajectory_variability/function/01_function.R")
-plotdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module/plot/'
-rdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module/result/'
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-# low dim reduction: umap, pca, or phate
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type (at least for the cells with origin cell type), col 3 is sample name
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-## Load the integrated Seurat object and pull out the per-cell PCA embedding.
-umap <- readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-## `a` first holds the named cell-type vector (its names become the `cell` column).
-a <- readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-## The sample column is the part of each cell name before the first ':'.
-ct <- data.frame(cell = names(a), celltype = a, sample = sub(':.*', '', names(a)), stringsAsFactors = FALSE)
-  
-# permutation 
-## From here on `a` holds the object returned by infer_tree_structure().
-a <- infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir, xlab = 'Principal Component 1', ylab = 'Principal Component 2')
-pdf(paste0(plotdir, 'mcl.pdf'), width=6,height=5)
-print(plotmclust(a, cell_point_size = 0.1, x.lab = 'PC1', y.lab = 'PC2'))
-dev.off()
-## NOTE(review): the literal 100 is passed here although n.permute is set to 3
-## at the top of this script - confirm which value is intended.
-result <- evaluate_uncertainty(a, 100)
-saveRDS(result, paste0(rdir, 'result.rds'))
-
-## One CSV file per element of the result list, named after the element.
-for (i in names(result)){
-  write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = TRUE)
-}
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/01_reproducibility.R
deleted file mode 100644
index 6a0319e..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/01_reproducibility.R
+++ /dev/null
@@ -1,274 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 1e3
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-mykmeans <- function(matrix, number.cluster = NA){
-  ## K-means clustering of the rows of `matrix`.
-  ##   matrix:         numeric matrix, one observation per row
-  ##   number.cluster: number of clusters; if NA it is chosen automatically
-  ##                   (at most 20) at the elbow of the betweenss/totss curve
-  ## Returns the object produced by kmeans().
-  set.seed(12345)
-  library(parallel)
-  if (is.na(number.cluster)){
-    ## kmeans() needs fewer centers than rows, so cap the scan for small inputs
-    maxclunum <- max(1, min(20, nrow(matrix) - 1))
-    ## mclapply() forks: never ask for more workers than tasks or cores, and
-    ## fall back to one worker where forking is unavailable (Windows)
-    ncores <- detectCores()
-    if (is.na(ncores) || .Platform$OS.type == 'windows') ncores <- 1
-    ncores <- max(1, min(maxclunum, ncores))
-    ## despite its name, `rss` holds betweenss/totss for every cluster number
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=ncores)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    ## elbow = knot position of the best two-segment linear fit to the curve
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-
-### choose the number of PCs at the elbow of the per-PC standard deviations
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:20
-## residual sum of squares of a two-segment linear fit, one per candidate knot
-knot.rss <- sapply(2:20, function(knot) {
-  hinge <- pmax(0, x - knot)
-  fit <- lm(sdev[1:20] ~ x + hinge)
-  sum(fit$residuals^2)
-})
-optpoint <- which.min(knot.rss)
-## candidate knots start at 2, so shift the winning index by one
-pcadim <- optpoint + 1
-pr <- pca[, seq(1, pcadim)]  # 2
-
-### mclust
-# mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-## pr is cells x PCs and is transposed before the call; mcl$clusterid (one
-## cluster label per cell) is what the rest of this section relies on.
-mcl <- exprmclust(t(pr), reduce = F)
-plotmclust(mcl, cell_point_size = 0.1)
-str(mcl)
-## find origin
-## Cross-tabulate cluster against annotated cell type, convert every cluster
-## (row) to proportions, and reshape to long format with melt().
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-## The origin is the cluster with the largest HSC proportion. The factor
-## levels are 1..max in order, so as.numeric() recovers the cluster label.
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-## ord is a list with one ordered vector of cell names per branch.
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-## Pseudotime is the rank of a cell within its branch (1, 2, ...).
-## NOTE(review): a cell listed in several branches appears several times in
-## names(pt); lookups by name then return the first occurrence - confirm this
-## is intended.
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-
-## plot pseudotime
-## Cells missing from pt get NA as their time value.
-pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-library(scattermore)
-library(RColorBrewer)
-ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  geom_scattermore() +
-  scale_color_gradient(low = 'yellow', high = 'blue')
-
-# -------------------------------------------------------
-# null distribution of Jaccard index, overlap coefficient
-# -------------------------------------------------------
-## Null Jaccard index: for each original branch, 1000 seeded random draws of
-## as many cells as the branch contains, each scored against the branch.
-js.null <- lapply(seq(1, length(ord)), function(i){
-  branch.cells <- ord[[i]]
-  sapply(seq(1, 1e3), function(seed){
-    set.seed(seed)
-    random.cells <- sample(rownames(pr), length(branch.cells))
-    n.shared <- length(intersect(random.cells, branch.cells))
-    n.shared/length(union(random.cells, branch.cells))
-  })
-})
-
-par(mfrow = c(1,3))
-hist(js.null[[1]])
-hist(js.null[[2]])
-hist(js.null[[3]])
-
-## 99th percentile of each branch's null values
-js.cut <- sapply(js.null, quantile, 0.99)
-
-## Null overlap coefficient, built from the same seeded draws.
-oc.null <- lapply(seq(1, length(ord)), function(i){
-  branch.cells <- ord[[i]]
-  sapply(seq(1, 1e3), function(seed){
-    set.seed(seed)
-    random.cells <- sample(rownames(pr), length(branch.cells))
-    n.shared <- length(intersect(random.cells, branch.cells))
-    n.shared/min(length(random.cells), length(branch.cells))
-  })
-})
-par(mfrow = c(1,3))
-hist(oc.null[[1]])
-hist(oc.null[[2]])
-hist(oc.null[[3]])
-
-oc.cut <- sapply(oc.null, quantile, 0.99)
-
-# -----------
-# permutation 
-# -----------
-corrlist <- jslist <- oclist <- list()
-for (pmid in seq(1, n.permute)){
-  ## resample cells with replacement, then keep one copy of every drawn cell
-  print(pmid)
-  set.seed(pmid)
-  drawn <- sample(rownames(pr), nrow(pr), replace = TRUE)
-  pr.pm <- pr[drawn,]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-
-  ## re-cluster the resampled cells
-  mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE)
-
-  ## start from the resampled cluster whose cells have the smallest mean
-  ## pseudotime in the original ordering
-  clu.pm <- mcl.pm[['clusterid']]
-  pt.pm.mean <- tapply(pt[names(clu.pm)], list(clu.pm), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-
-  ## order the resampled cells; pseudotime is the rank within each branch
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = TRUE, orderonly = TRUE)
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-
-  ## apply `stat` to every (original branch, resampled branch) pair; in the
-  ## resulting matrix rows index ord.pm and columns index ord
-  pairwise <- function(stat){
-    sapply(seq(1, length(ord)), function(i){
-      sapply(seq(1, length(ord.pm)), function(j) stat(ord[[i]], ord.pm[[j]]))
-    })
-  }
-  ## Jaccard index
-  js <- pairwise(function(b.ori, b.pm){
-    length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-  })
-  ## overlap coefficient
-  oc <- pairwise(function(b.ori, b.pm){
-    length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-  })
-  ## correlation of original and resampled pseudotime over the shared cells
-  corr <- pairwise(function(b.ori, b.pm){
-    ov <- intersect(b.ori, b.pm)
-    cor(pt[ov], pt.pm[ov])
-  })
-  corr[is.na(corr)] <- 0
-  colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-  corrlist[[pmid]] <- corr
-}
-saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')   
-saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-## The correlations get their own file: writing them to pm_oc.rds would
-## overwrite the overlap coefficients saved on the line above.
-saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_corr.rds')   
-
-## Pooled histograms of all Jaccard indices and overlap coefficients.
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-par(mfrow = c(1,2))
-hist(jsm)
-hist(ocm)
-
-## For every permutation, match resampled branches (rows) to original branches
-## (columns) and record which original branches were recovered.
-res <- corr.score <- list()
-for (i in seq(1, length(jslist))){
-  js <- jslist[[i]]
-  ## 1 where the Jaccard index exceeds the null cutoff of that original branch
-  js.binary <- sapply(seq(1,ncol(js)), function(c){
-    (js[,c] > js.cut[c]) + 0
-  })
-  ## Reduce to at most one match per row and per column by keeping the
-  ## largest Jaccard index. The column step must clear the COLUMN and use the
-  ## loop variable dup.i; clearing a row or indexing with the whole dup.id
-  ## vector corrupts the matching.
-  while (any(rowSums(js.binary) > 1) | any(colSums(js.binary) > 1)){
-    for (dup.i in which(rowSums(js.binary) > 1)){
-      addid <- which.max(js[dup.i, ])
-      js.binary[dup.i, ] <- 0
-      js.binary[dup.i, addid] <- 1
-    }
-    for (dup.i in which(colSums(js.binary) > 1)){
-      addid <- which.max(js[, dup.i])
-      js.binary[, dup.i] <- 0
-      js.binary[addid, dup.i] <- 1
-    }
-  }
-  
-  ## keep the pseudotime correlation only for matched branch pairs
-  corr.score[[i]] <- corrlist[[i]] * js.binary
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  ## column 2 of the melted matrix is the index of the original branch
-  res[[i]] <- as.character(js.melt[,2])
-}
-res <- unlist(res)  
-## fraction of permutations in which each original branch was matched
-js.perc <- table(res)/n.permute
-saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-## mean matched correlation per original branch over all permutations
-corr.score.m <- do.call(rbind, corr.score)
-corr.score.v <- colSums(corr.score.m)/n.permute
-saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds')
-
-## Same matching as for the Jaccard index, driven by the overlap coefficient:
-## one vector of matched original-branch indices per permutation.
-res <- sapply(seq(1,length(oclist)), function(i){
-  oc <- oclist[[i]]
-  ## 1 where the coefficient exceeds the null cutoff of that original branch
-  oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-    (oc[,c] > oc.cut[c]) + 0
-  })
-  ## keep only the best-scoring entry in every over-matched row, then column,
-  ## and repeat until no row or column has more than one match
-  while (any(rowSums(oc.binary) > 1) | any(colSums(oc.binary) > 1)){
-    for (row.i in which(rowSums(oc.binary) > 1)){
-      best <- which.max(oc[row.i, ])
-      oc.binary[row.i, ] <- 0
-      oc.binary[row.i, best] <- 1
-    }
-    for (col.i in which(colSums(oc.binary) > 1)){
-      best <- which.max(oc[, col.i])
-      oc.binary[, col.i] <- 0
-      oc.binary[best, col.i] <- 1
-    }
-  }
-  oc.melt <- melt(oc.binary)
-  matched <- oc.melt[oc.melt[,3]!=0,]
-  as.character(matched[,2])
-})
-res <- unlist(res)  
-## fraction of permutations in which each original branch was matched
-oc.perc <- table(res)/n.permute
-sort((js.perc + oc.perc)/2)
-saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/02_samples_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/02_samples_reproducibility.R
deleted file mode 100644
index 82056b3..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/02_samples_reproducibility.R
+++ /dev/null
@@ -1,452 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 1e3
-max.clunum <- 50
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-mykmeans <- function(matrix, number.cluster = NA){
-  ## K-means clustering of the rows of `matrix`.
-  ##   matrix:         numeric matrix, one observation per row
-  ##   number.cluster: number of clusters; if NA it is chosen automatically
-  ##                   (at most 20) at the elbow of the betweenss/totss curve
-  ## Returns the object produced by kmeans().
-  set.seed(12345)
-  library(parallel)
-  if (is.na(number.cluster)){
-    ## kmeans() needs fewer centers than rows, so cap the scan for small inputs
-    maxclunum <- max(1, min(20, nrow(matrix) - 1))
-    ## mclapply() forks: never ask for more workers than tasks or cores, and
-    ## fall back to one worker where forking is unavailable (Windows)
-    ncores <- detectCores()
-    if (is.na(ncores) || .Platform$OS.type == 'windows') ncores <- 1
-    ncores <- max(1, min(maxclunum, ncores))
-    ## despite its name, `rss` holds betweenss/totss for every cluster number
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=ncores)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    ## elbow = knot position of the best two-segment linear fit to the curve
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-
-### choose the number of PCs at the elbow of the per-PC standard deviations
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:max.clunum
-## residual sum of squares of a two-segment linear fit, one per candidate knot
-knot.rss <- sapply(2:max.clunum, function(knot) {
-  hinge <- pmax(0, x - knot)
-  fit <- lm(sdev[1:max.clunum] ~ x + hinge)
-  sum(fit$residuals^2)
-})
-optpoint <- which.min(knot.rss)
-## candidate knots start at 2, so shift the winning index by one
-pcadim <- optpoint + 1
-pr <- pca[, seq(1, pcadim)]  # 7
-
-## clustering
-## K-means with a fixed 14 clusters on the selected PCs; `clu` is the named
-## vector of cluster labels (one per row of pr).
-clu <- mykmeans(pr, number.cluster = 14)$cluster
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-
-# mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-# ggplot(data = pd, aes(x = x, y = y, color = clu)) + 
-#   geom_scattermore()+
-#   scale_color_manual(values = mypalette(14))+
-#   theme_classic() + xlab('UMAP1') + ylab('UMAP2')
-
-## cell type composition in clusters
-## Cross-tabulate cluster against annotated cell type, convert every cluster
-## (row) to proportions, and reshape to long format with melt().
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-# ggplot(data = pd) +
-#   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-#   theme_classic() +
-#   ylab('Celltype Proportion') +
-#   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-### mclust
-## The k-means labels are passed in as the clustering; pr is cells x PCs and
-## is transposed before the call.
-mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-# mcl <- exprmclust(t(pr), reduce = F)
-# plotmclust(mcl, cell_point_size = 0.1)
-# str(mcl)
-
-# --------------------
-# construct pseudotime 
-# --------------------
-## find origin
-## Same composition table as above, now based on mcl$clusterid.
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-## The origin is the cluster with the largest HSC proportion. The factor
-## levels are 1..max in order, so as.numeric() recovers the cluster label.
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-## ord is a list with one ordered vector of cell names per branch.
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-## Pseudotime is the rank of a cell within its branch (1, 2, ...).
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-# ## plot pseudotime
-# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-# library(scattermore)
-# library(RColorBrewer)
-# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-#   geom_scattermore() +
-#   scale_color_gradient(low = 'yellow', high = 'blue')
-
-# ------------------------------------------------------------
-# get candidate branches to test reproducibility, 20200726 >>
-# ------------------------------------------------------------
-findbranch <- function(mst, order, origin){
-  ## Enumerate candidate branches of a cluster-level tree.
-  ##   mst:    igraph tree whose vertices are clusters
-  ##   order:  list whose names describe the ordered paths, in the form
-  ##           'backbone 1,2,3' / 'branch: 1,2,4'; only the names are used
-  ##   origin: the start cluster
-  ## Returns a list with one vector of vertex ids per branch, i.e. per path
-  ## between two end/branching vertices (or the origin) that passes through no
-  ## other such vertex. A path is reversed when it does not occur, in the
-  ## given direction, inside any of the paths named in `order`.
-  library(igraph)
-  deg <- degree(mst)
-  ## end points of branches: leaves (degree 1) and branching vertices (> 2)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  eg <- expand.grid(1:length(vertex), 1:length(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg = data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  ## seq_len() keeps this a no-op when there is no vertex pair at all
-  tmpbranch <- lapply(seq_len(nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    ## keep the path only if its two ends are the only special vertices on it
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[lengths(tmpbranch) > 0]
- 
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  ## wrap every path in commas so that '1,2' cannot match inside '11,2' or '1,21'
-  allbranch <- paste0(',', allbranch, ',')
-  ## lapply, not sapply: sapply would collapse branches of equal length into a
-  ## matrix and break the list indexing done by callers
-  newbranch <- lapply(tmpbranch, function(i) {
-      tmp <- paste0(',', paste0(i, collapse = ','), ',')
-      if (!any(grepl(tmp, allbranch, fixed = TRUE))){
-        rev(i)
-      } else {
-        i
-      }
-  })
-  return(newbranch)
-}
-newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-
-
-# -------------------------------------------------------
-# null distribution of Jaccard index, overlap coefficient
-# -------------------------------------------------------
-library(parallel)
-## Sample label of every cell: the part of the cell name before the first ':'.
-## Defined at top level because the code below and later sections read it.
-alls <- gsub(':.*', '', rownames(pr))
-## detectCores() - 2 can be zero or negative on small machines
-n.cores <- max(1, detectCores() - 2, na.rm = TRUE)
-
-## Per-sample null distribution of a set-similarity statistic.
-##   stat: function(b.pm.s, b.ori.s) returning one number for the cells of a
-##         single sample in the random set and in the original branch
-## Returns one matrix per candidate branch: 1000 seeded random draws (rows)
-## by samples (columns).
-sample.null <- function(stat){
-  lapply(seq(1, length(newbranch)), function(i){
-    ## all cells belonging to the clusters of branch i
-    b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-    b.ori.alls <- gsub(':.*', '', b.ori)
-    tmp <- mclapply(seq(1, 1e3), function(j){
-      set.seed(j)
-      b.pm <- sample(rownames(pr), length(b.ori))
-      b.pm.alls <- gsub(':.*', '', b.pm)
-      sapply(unique(alls), function(s){
-        stat(b.pm[b.pm.alls == s], b.ori[b.ori.alls == s])
-      })
-    },mc.cores = n.cores)
-    do.call(rbind,tmp)
-  })
-}
-
-## Jaccard index
-js.null <- sample.null(function(b.pm.s, b.ori.s){
-  length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s))
-})
-## 99th percentile per sample (rows) and candidate branch (columns)
-js.cut <- sapply(js.null, function(i) apply(i, 2, quantile, 0.99))
-# ------------------
-
-## overlap coefficient
-oc.null <- sample.null(function(b.pm.s, b.ori.s){
-  length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s))
-})
-oc.cut <- sapply(oc.null, function(i) apply(i, 2, quantile, 0.99))
-
-# -----------
-# permutation 
-# -----------
-## For every permutation: resample the cells, re-cluster, rebuild the tree and
-## pseudotime, and compare every original candidate branch with every
-## resampled candidate branch separately for each sample. Each list below
-## gets one entry per permutation; an entry is a list over original branches
-## of matrices (rows: resampled branches, columns: samples).
-## NOTE(review): `alls` is read below, but in this script it is only assigned
-## inside the js.null closure above - confirm it exists at top level.
-corrlist.alls <- jslist.alls <- oclist.alls <- list()
-## overrides the n.permute value set at the top of the script
-n.permute = 100
-for (pmid in seq(1, n.permute)){
-  ## boostrap cells
-  ## (sample with replacement, then keep one copy of every drawn cell)
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  # ## cluster cells
-  clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-
-  ## cluster centers in PC1/PC2, used only by the commented-out plot below
-  pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-  pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-  pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-  pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-  ## NOTE(review): hard-coded label offset for row 14; tied to number.cluster = 14
-  pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-
-  # ggplot() + 
-  #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-  #   scale_color_manual(values = mypalette(14))+
-  #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-  #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-  
-
-  ## cell type composition in clusters
-  # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  # tab <- table(pd[,3:4])
-  # tab <- tab/rowSums(tab)
-  # pd <- melt(tab)
-  # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  # ggplot(data = pd) +
-  #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-  #   theme_classic() +
-  #   ylab('Celltype Proportion') +
-  #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-  # build pseudotime
-  mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  ## (the resampled cluster whose cells have the smallest mean original pseudotime)
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  ## (pseudotime is the rank of a cell within its branch)
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore() + theme_classic()
-  
-  # get candidate branches
-  newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-  
-  ## compare two MST
-  ## Jaccard index per sample for every (original, resampled) branch pair.
-  js <- sapply(seq(1, length(newbranch)), function(i){
-          print('i')
-          print(i)
-          ## first ordered path whose name contains the cluster sequence of branch i
-          id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-          cells <- ord[[id]]
-          ## cells of the branch's clusters that also lie on that path
-          b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-          b.ori.alls <- gsub(':.*', '', b.ori)
-          tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-            print(j)
-            id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-            cells <- ord.pm[[id]]
-            b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-            b.pm.alls <- gsub(':.*', '', b.pm)
-            # js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-            tmpp <- sapply(unique(alls), function(s){
-              b.pm.s <- b.pm[b.pm.alls == s]
-              b.ori.s <- b.ori[b.ori.alls == s]
-              length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s))
-            })  
-          },mc.cores = detectCores()-2)
-          tmp <- do.call(rbind,tmp)
-          rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-          tmp
-        }, simplify = FALSE)
-  names(js) <- paste0('branch', seq(1, length(newbranch)))
-  ###### =====================================
-  ## Overlap coefficient per sample; same branch lookup as for js.
-  oc <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-            b.ori.alls <- gsub(':.*', '', b.ori)
-            tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-                id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                cells <- ord.pm[[id]]
-                b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                b.pm.alls <- gsub(':.*', '', b.pm)
-                # oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-                tmpp <- sapply(unique(alls), function(s){
-                  b.pm.s <- b.pm[b.pm.alls == s]
-                  b.ori.s <- b.ori[b.ori.alls == s]
-                  length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s))
-                })  
-          },mc.cores = detectCores()-2)
-          tmp <- do.call(rbind,tmp)
-          rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-          tmp
-        }, simplify = FALSE)
-  names(oc) <- paste0('branch', seq(1, length(newbranch)))           
-        
-  
-  ## Correlation of original and resampled pseudotime over the shared cells of
-  ## each sample; NA correlations are set to 0.
-  corr <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-              b.ori.alls <- gsub(':.*', '', b.ori)
-              tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  b.pm.alls <- gsub(':.*', '', b.pm)
-                  # ov = intersect(b.ori, b.pm)
-                  # cor(pt[ov], pt.pm[ov])
-                  tmpp <- sapply(unique(alls), function(s){
-                    b.pm.s <- b.pm[b.pm.alls == s]
-                    b.ori.s <- b.ori[b.ori.alls == s] 
-                    ov = intersect(b.ori.s, b.pm.s)
-                    cor(pt[ov], pt.pm[ov])
-                  })
-              }, mc.cores = detectCores()-2) 
-              tmp <- do.call(rbind, tmp)
-              rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-              tmp[is.na(tmp)] <- 0
-              tmp
-          }, simplify = FALSE)
-  # corr[is.na(corr)] <- 0
-  names(corr) <- paste0('branch', seq(1, length(newbranch)))           
-  # colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-  jslist.alls[[pmid]] <- js
-  oclist.alls[[pmid]] <- oc
-  corrlist.alls[[pmid]] <- corr
-}
-saveRDS(jslist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_js_alls.rds')   
-saveRDS(oclist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_oc_alls.rds')   
-## The correlations go to the same result/samples directory as the two files
-## above, under their own name instead of the overlap-coefficient file name.
-saveRDS(corrlist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_corr_alls.rds')   
-
-# jsm <- do.call(rbind, jslist)
-# ocm <- do.call(rbind, oclist)
-# par(mfrow = c(1,2))
-# hist(jsm)
-# hist(ocm)
-## Sample label of every cell: the part of the cell name before the first ':'.
-alls <- gsub(':.*', '', rownames(pr))
-
-## Cutoffs of one sample from a samples x branches cutoff matrix (a plain
-## vector is returned unchanged, which is what a single sample produces).
-sample_cut <- function(cutoffs, s){
-  if (is.matrix(cutoffs)) cutoffs[s, ] else cutoffs
-}
-
-## Turn a score matrix (rows: resampled branches, columns: original branches)
-## into a 0/1 matching with at most one 1 per row and per column.
-##   score: matrix of Jaccard indices or overlap coefficients
-##   cut:   one cutoff per column of `score`
-match_branches <- function(score, cut){
-  binary <- sapply(seq(1, ncol(score)), function(c){
-    (score[,c] > cut[c]) + 0
-  })
-  ## keep only the best-scoring entry in every over-matched row, then column,
-  ## and repeat until no row or column has more than one match
-  while (any(rowSums(binary) > 1) | any(colSums(binary) > 1)){
-    for (dup.i in which(rowSums(binary) > 1)){
-      addid <- which.max(score[dup.i, ])
-      binary[dup.i, ] <- 0
-      binary[dup.i, addid] <- 1
-    }
-    for (dup.i in which(colSums(binary) > 1)){
-      addid <- which.max(score[, dup.i])
-      binary[, dup.i] <- 0
-      binary[addid, dup.i] <- 1
-    }
-  }
-  binary
-}
-
-## Fraction of permutations in which each original branch was matched.
-## Every branch index is a factor level, so unmatched branches are kept as 0
-## and the entries stay in numeric branch order before they are named.
-matched_fraction <- function(matched){
-  tab <- table(factor(matched, levels = seq(1, length(newbranch))))/n.permute
-  names(tab) <- newbranch
-  tab
-}
-
-## One data frame per sample with, for every original branch, the matched
-## fraction by Jaccard index, by overlap coefficient, and the mean matched
-## pseudotime correlation.
-df.alls <- lapply(unique(alls), function(s){
-  ## restrict every per-permutation score to the column of sample s
-  jslist = sapply(jslist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  oclist = sapply(oclist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  corrlist = sapply(corrlist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  
-  res <- corr.score <- list()
-  for (i in seq(1, length(jslist))){
-    print(i)
-    ## use the cutoffs of THIS sample; js.cut[c] alone would index the
-    ## samples x branches matrix linearly and pick unrelated entries
-    js.binary <- match_branches(jslist[[i]], sample_cut(js.cut, s))
-    corr.score[[i]] <- corrlist[[i]] * js.binary
-    js.melt <- melt(js.binary)
-    js.melt <- js.melt[js.melt[,3]!=0,]
-    ## column 2 of the melted matrix is the index of the original branch
-    res[[i]] <- as.character(js.melt[,2])
-  }
-  js.perc <- matched_fraction(unlist(res))
-  
-  corr.score.m <- do.call(rbind, corr.score)
-  corr.score.v <- colSums(corr.score.m)/n.permute
-  names(corr.score.v) <- newbranch
-  
-  res <- sapply(seq(1,length(oclist)), function(i){
-    print(i)
-    oc.binary <- match_branches(oclist[[i]], sample_cut(oc.cut, s))
-    oc.melt <- melt(oc.binary)
-    oc.melt <- oc.melt[oc.melt[,3]!=0,]
-    as.character(oc.melt[,2])
-  })
-  oc.perc <- matched_fraction(unlist(res))
-  
-  ## each table contributes a label and a frequency column; keep the two
-  ## frequency columns and the correlation score
-  df <- data.frame(js.perc = js.perc, oc.perc = oc.perc, corr.score.v = corr.score.v)
-  df[, c(2,4,5)]
-})
-names(df.alls) <- unique(alls)
-df.alls[order(names(df.alls))]
-
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/03_try_to_build_module.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/03_try_to_build_module.R
deleted file mode 100644
index c822909..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/03_try_to_build_module.R
+++ /dev/null
@@ -1,443 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 3
-max.clunum <- 50
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-#  umap, pca
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-alls <- sub(':.*', '', names(a))
-names(alls) <- names(a)
-  
-mykmeans <- function(matrix, number.cluster = NA){
-  ## cluster the rows
-  set.seed(12345)
-  library(parallel)
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=20)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-
-### determine numPC
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:max.clunum
-optpoint <- which.min(sapply(2:max.clunum, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 7
-
-## clustering
-clu <- mykmeans(pr, number.cluster = 14)$cluster
-# pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-# mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-# ggplot(data = pd, aes(x = x, y = y, color = clu)) + 
-#   geom_scattermore()+
-#   scale_color_manual(values = mypalette(14))+
-#   theme_classic() + xlab('UMAP1') + ylab('UMAP2')
-
-# ## cell type composition in clusters
-# pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-# tab <- table(pd[,3:4])
-# tab <- tab/rowSums(tab)
-# pd <- melt(tab)
-# pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-# 
-# ggplot(data = pd) +
-#   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-#   theme_classic() +
-#   ylab('Celltype Proportion') +
-#   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-### mclust
-mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-# mcl <- exprmclust(t(pr), reduce = F)
-# plotmclust(mcl, cell_point_size = 0.1)
-# str(mcl)
-
-# --------------------
-# construct pseudotime 
-# --------------------
-## find origin
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-# ## plot pseudotime
-# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-# library(scattermore)
-# library(RColorBrewer)
-# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-#   geom_scattermore() +
-#   scale_color_gradient(low = 'yellow', high = 'blue')
-
-# ------------------------------------------------------------
-# get candidate branches to test reproducibility, 20200726 >>
-# ------------------------------------------------------------
-findbranch <- function(mst, order, origin){
-  deg <- degree(mst)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  eg <- expand.grid(1:length(vertex), 1:length(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg = data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  library(igraph)
-  tmpbranch <- lapply(seq(1,nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[sapply(tmpbranch, length) >0]  
- 
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]])
-  allbranch <- paste0(names(allbranch), collapse = ' ')
-  newbranch <-sapply(tmpbranch, function(i) {
-      tmp <- paste0(i, collapse = ',')
-      if (!grepl(tmp, allbranch)){
-        rev(i)
-      } else {
-        i
-      }
-  })
-  return(newbranch)
-}
-newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-
-# -----------------------------------------------------
-# Evaluate robustness of tree branches using resampling
-# -----------------------------------------------------
-
-# null distribution of Jaccard index, overlap coefficient
-
-js.null <- lapply(seq(1, length(newbranch)), function(i) {
-  b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-  })
-})
-
-# par(mfrow = c(2,ceiling(length(js.null)/2)))
-# for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50)
-
-js.cut <- sapply(js.null, quantile, 0.99)
-
-oc.null <- lapply(seq(1, length(newbranch)), function(i){
-  b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-  })
-})
-# par(mfrow = c(2,ceiling(length(oc.null)/2)))
-# for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50)
-oc.cut <- sapply(oc.null, quantile, 0.99)
-
-# permutation 
-
-get_binary <- function(js){
-  js.binary <- sapply(seq(1,ncol(js)), function(c){
-    (js[,c] > js.cut[c]) + 0
-  })
-  while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){
-    dup.id <- which(rowSums(js.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(js[dup.id, ])
-      js.binary[dup.id, ] <- 0
-      js.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(js[dup.i, ])
-        js.binary[dup.i, ] <- 0
-        js.binary[dup.i, addid] <- 1  
-      }
-    }
-      
-    dup.id <- which(colSums(js.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(js[, dup.id])
-      js.binary[, dup.id] <- 0
-      js.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(js[, dup.i])
-        js.binary[, dup.i] <- 0
-        js.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  return(js.binary)
-}
-
-ctcomplist <- reproduce <- corr.score <- corrlist <- jslist <- oclist <- list()
-
-for (pmid in seq(1, n.permute)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  ## cluster cells
-  clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-
-  # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-  pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-  pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-  pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-  pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-  pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  # ggplot() + 
-  #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-  #   scale_color_manual(values = mypalette(14))+
-  #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-  #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-  
-
-  ## cell type composition in clusters
-  # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  # tab <- table(pd[,3:4])
-  # tab <- tab/rowSums(tab)
-  # pd <- melt(tab)
-  # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  # ggplot(data = pd) +
-  #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-  #   theme_classic() +
-  #   ylab('Celltype Proportion') +
-  #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-  ## build pseudotime
-  mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-  ## plot pseudotime
-  
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore() + theme_classic()
-  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  
-  # get candidate branches
-  newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-  
-  ## compare two MST
-  js <- sapply(seq(1, length(newbranch)), function(i){
-          id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-          cells <- ord[[id]]
-          b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-          sapply(seq(1, length(newbranch.pm)), function(j){
-            print(j)
-            id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-            cells <- ord.pm[[id]]
-            b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-        })
-  oc <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-            sapply(seq(1, length(newbranch.pm)), function(j){
-                id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                cells <- ord.pm[[id]]
-                b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  corr <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-              
-              sapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  ov = intersect(b.ori, b.pm)
-                  cor(pt[ov], pt.pm[ov])
-              }) 
-          })
-  corr[is.na(corr)] <- 0
-  colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-  corrlist[[pmid]] <- corr
-  
-  ## get js binary to matched branches <<<<<<<<<<<<<<< 
-  js.binary <- get_binary(js)
-  corr.score[[pmid]] <- corr * js.binary
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-  reproduce[[pmid]] <- as.character(js.melt[,2])
-  ## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  ## 
-  tmp <- js.melt[1,2]
-  ctcomp <- sapply(js.melt[,2], function(tmp){
-    c <- names(clu)[clu %in% newbranch.pm[[tmp]]]
-    ctcomp <- rep(0, length(unique(alls)))
-    names(ctcomp) <- unique(alls)
-    ctcomp[names(table(alls[c]))] <- table(alls[c])
-  })
-  colnames(ctcomp) <- paste0('origin', js.melt[,2])
-  ctcomp <- ctcomp/rowSums(ctcomp)
-  
-  
-  ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch))
-  colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch)))
-  rownames(ctcomp.new) <- unique(alls)
-  ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp
-  ctcomplist[[pmid]] <- t(ctcomp.new)
-  
-}
-
-# saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')   
-# saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-# 
-# saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-# par(mfrow = c(1,2))
-# hist(jsm)
-# hist(ocm)
-
-## moved within boostrap
-# reproduce <- corr.score <- list()
-# for (i in seq(1, length(jslist))){
-#   print(i)
-#   js <- jslist[[i]]
-#   js.binary <- get_binary(js)
-#   corr.score[[i]] <- corrlist[[i]] * js.binary
-#   js.melt <- melt(js.binary)
-#   js.melt <- js.melt[js.melt[,3]!=0,]
-#   colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-#   reproduce[[i]] <- as.character(js.melt[,2])
-# }
-
-reproduce <- unlist(reproduce)  
-
-
-js.perc <- rep(0, length(newbranch))
-js.perc[as.numeric(names(table(reproduce)))] <-  table(reproduce)/n.permute
-names(js.perc) <- newbranch
-# saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-corr.score.m <- do.call(rbind, corr.score)
-corr.score.v <- colSums(corr.score.m)/n.permute
-names(corr.score.v) <- newbranch
-# saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds')
-
-res <- sapply(seq(1,length(oclist)), function(i){
-  print(i)
-  oc <- oclist[[i]]
-  oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-    (oc[,c] > oc.cut[c]) + 0
-  })
-  while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){
-    dup.id <- which(rowSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[dup.id, ])
-      oc.binary[dup.id, ] <- 0
-      oc.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[dup.i, ])
-        oc.binary[dup.i, ] <- 0
-        oc.binary[dup.i, addid] <- 1  
-      }
-    }
-    dup.id <- which(colSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[, dup.id])
-      oc.binary[, dup.id] <- 0
-      oc.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[, dup.i])
-        oc.binary[, dup.i] <- 0
-        oc.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  oc.melt <- melt(oc.binary)
-  oc.melt <- oc.melt[oc.melt[,3]!=0,]
-  as.character(oc.melt[,2])
-})
-res <- unlist(res)  
-oc.perc <- rep(0, length(newbranch))
-oc.perc[as.numeric(names(table(res)))] <-  table(res)/n.permute
-names(oc.perc) <- newbranch
-sort((js.perc + oc.perc)/2)
-
-# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE)
-sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean)
-sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd)
-rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))]
-rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))]
-
-result <- list(detection.rate = detection.rate, 
-               sample.cellcomp.mean = sample.cellcomp.mean, 
-               sample.cellcomp.sd = sample.cellcomp.sd)
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/04_try_to_build_module_v2.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/04_try_to_build_module_v2.R
deleted file mode 100644
index 43d7f22..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/04_try_to_build_module_v2.R
+++ /dev/null
@@ -1,409 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 3
-max.clunum <- 50
-# setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability")
-plotdir <- 'tree_variability/plot/'
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-#  umap, pca
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-umap = readRDS('hca/data/HCA/proc/integrate/umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-  
-mykmeans <- function(matrix, number.cluster = NA, maxclunum = 20, seed = 12345){
-  ## cluster the rows
-  set.seed(seed)
-  library(parallel)
-  if (is.na(number.cluster)){
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      set.seed(12345)
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=20)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    print(optclunum)
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-findbranch <- function(mst, order, origin){
-  deg <- degree(mst)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  eg <- expand.grid(1:length(vertex), 1:length(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg = data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  library(igraph)
-  tmpbranch <- lapply(seq(1,nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[sapply(tmpbranch, length) >0]  
- 
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]])
-  allbranch <- paste0(names(allbranch), collapse = ' ')
-  newbranch <-sapply(tmpbranch, function(i) {
-      tmp <- paste0(i, collapse = ',')
-      if (!grepl(tmp, allbranch)){
-        rev(i)
-      } else {
-        i
-      }
-  })
-  return(newbranch)
-}
-get_binary <- function(matrix, matrix.cut){
-  ## match boostrap and origin branches.
-  ## matrix: #boostrap.branch * #origin.branch, values are js or oc
-  ## matrix.cut: js or oc null distribution cutoff 
-  matrix.binary <- sapply(seq(1,ncol(matrix)), function(c){
-    (matrix[,c] > matrix.cut[c]) + 0
-  })
-  while (length(which(rowSums(matrix.binary) > 1)) > 0 | length(which(colSums(matrix.binary) > 1)) > 0){
-    dup.id <- which(rowSums(matrix.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(matrix[dup.id, ])
-      matrix.binary[dup.id, ] <- 0
-      matrix.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(matrix[dup.i, ])
-        matrix.binary[dup.i, ] <- 0
-        matrix.binary[dup.i, addid] <- 1  
-      }
-    }
-      
-    dup.id <- which(colSums(matrix.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(matrix[, dup.id])
-      matrix.binary[, dup.id] <- 0
-      matrix.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(matrix[, dup.i])
-        matrix.binary[, dup.i] <- 0
-        matrix.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  return(matrix.binary)
-}
-infer_tree_structure <- function(pca, ct, origin.celltype, number.cluster = NA, plotdir = getwd()){
-  alls <- sub(':.*', '', ct$cell)
-  names(alls) <- ct$cell
-  set.seed(12345)
-  sdev <- apply(pca, 2, sd)
-  x <- 1:max.clunum
-  optpoint <- which.min(sapply(2:max.clunum, function(i) {
-    x2 <- pmax(0, x - i)
-    sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2)
-  }))
-  pcadim = optpoint + 1
-  pr <- pca[,1:pcadim]  # 7
-  
-  ## clustering
-  # clu <- mykmeans(pr, number.cluster = number.cluster, maxclunum = 50, seed = i)$cluster
-  clu <- mykmeans(pr, maxclunum = 50, number.cluster = number.cluster)$cluster
-  table(clu)
-  pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-  mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-  pdf(paste0(plotdir, 'cluster.pdf'), width = 5, height = 4)
-  print(ggplot(data = pd, aes(x = x, y = y, color = clu)) +
-    geom_scattermore()+
-    scale_color_manual(values = mypalette(14))+
-    theme_classic() + xlab('PC1') + ylab('PC2'))
-  dev.off()
-  ## cell type composition in clusters
-  pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  tab <- table(pd[,3:4])
-  tab <- tab/rowSums(tab)
-  pd <- melt(tab)
-  pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  pdf(paste0(plotdir, 'celltype_composition_for_cluster.pdf'), width = 9, height = 5)
-  print(ggplot(data = pd) +
-    geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-    theme_classic() +
-    ylab('Celltype Proportion') +
-    scale_fill_manual(values = mypalette(length(unique(pd$celltype)))))
-  dev.off()
-  ### mclust
-  mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-  # mcl <- exprmclust(t(pr), reduce = F)
-  pdf(paste0(plotdir, 'mcl.pdf'), width=8,height=8)
-  print(plotmclust(mcl, cell_point_size = 0.1))
-  dev.off()
-
-  # str(mcl)
-  # 
-  # --------------------
-  # construct pseudotime 
-  # --------------------
-  ## find origin
-  pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-  pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  tab <- table(pd[,3:4])
-  tab <- tab/rowSums(tab)
-  pd <- melt(tab)
-  pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  tmp <- pd[pd$celltype == origin.celltype, ]
-  origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-  
-  ## construct pseudotime
-  ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-  str(ord)
-  length(ord)
-  pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-  names(pt) <- unname(unlist(ord))
-  
-  # ## plot pseudotime
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-  library(scattermore)
-  library(RColorBrewer)
-  pdf(paste0(plotdir, 'pseudotime.pdf'), width = 7, height = 6)  
-  print(ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-    geom_scattermore() +
-    scale_color_gradient(low = 'yellow', high = 'blue'))
-  dev.off()
-  # ------------------------------------------------------------
-  # get candidate branches to test reproducibility, 20200726 >>
-  # ------------------------------------------------------------
-  
-  newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-  
-  # -----------------------------------------------------
-  # Evaluate robustness of tree branches using resampling
-  # -----------------------------------------------------
-  
-  # null distribution of Jaccard index, overlap coefficient
-  
-  js.null <- lapply(seq(1, length(newbranch)), function(i) {
-    b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-    tmp <- sapply(seq(1, 1e3), function(j){
-      set.seed(j)
-      b.pm <- sample(rownames(pr), length(b.ori))
-      length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-    })
-  })
-  
-  # par(mfrow = c(2,ceiling(length(js.null)/2)))
-  # for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50)
-  
-  js.cut <- sapply(js.null, quantile, 0.99)
-  
-  oc.null <- lapply(seq(1, length(newbranch)), function(i){
-    b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-    tmp <- sapply(seq(1, 1e3), function(j){
-      set.seed(j)
-      b.pm <- sample(rownames(pr), length(b.ori))
-      length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-    })
-  })
-  # par(mfrow = c(2,ceiling(length(oc.null)/2)))
-  # for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50)
-  oc.cut <- sapply(oc.null, quantile, 0.99)
-  
-  mcl$pseudotime <- pt
-  mcl$branch <- newbranch
-  mcl$js.cut <- js.cut
-  mcl$oc.cut <- oc.cut
-  mcl$pca <- pr
-  mcl$order <- ord
-  mcl$allsample <- alls
-  return(mcl)
-}
-evaluate_uncertainty <- function(inferobj, n.permute){
-  pr <- inferobj$pca
-  newbranch <- inferobj$branch
-  js.cut <- inferobj$js.cut
-  oc.cut <- inferobj$oc.cut 
-  pt <- inferobj$pseudotime
-  ord <- inferobj$order
-  alls <- inferobj$allsample
-  ctcomplist <- reproduce.js <- reproduce.oc <- corr.score <- list()
-  for (pmid in seq(1, n.permute)){
-    print(pmid)
-    ## boostrap cells
-    set.seed(pmid)
-    pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-    pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-    
-    ## cluster cells
-    clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-  
-    # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-    # pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-    # pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-    # pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-    # pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-    # pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    # ggplot() + 
-    #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-    #   scale_color_manual(values = mypalette(14))+
-    #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-    #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-    
-  
-    ## cell type composition in clusters
-    # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-    # tab <- table(pd[,3:4])
-    # tab <- tab/rowSums(tab)
-    # pd <- melt(tab)
-    # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-    # ggplot(data = pd) +
-    #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-    #   theme_classic() +
-    #   ylab('Celltype Proportion') +
-    #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-  
-    ## build pseudotime
-    mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-    # plotmclust(mcl.pm, cell_point_size = 0.1)
-    
-    ## select origin cluster
-    pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-    start.cluster <- names(which.min(pt.pm.mean))
-    
-    ## construct pseudotime
-    ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-    # str(ord.pm)
-    
-    pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-    names(pt.pm) <- unname(unlist(ord.pm))
-    # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-    ## plot pseudotime
-    
-    pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-    # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-    #   geom_scattermore() + theme_classic()
-    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    
-    # get candidate branches
-    newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-    
-    ## compare two MST
-    js <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-            sapply(seq(1, length(newbranch.pm)), function(j){
-              
-              id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-              cells <- ord.pm[[id]]
-              b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-              js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-            })
-          })
-    oc <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-              sapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-             }) 
-          })
-    corr <- sapply(seq(1, length(newbranch)), function(i){
-                id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-                cells <- ord[[id]]
-                b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-                
-                sapply(seq(1, length(newbranch.pm)), function(j){
-                    id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                    cells <- ord.pm[[id]]
-                    b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                    ov = intersect(b.ori, b.pm)
-                    cor(pt[ov], pt.pm[ov])
-                }) 
-            })
-    corr[is.na(corr)] <- 0
-    colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-    
-    ## get js binary to match branches 
-    js.binary <- get_binary(js, js.cut)
-    corr.score[[pmid]] <- corr * js.binary
-    js.melt <- melt(js.binary)
-    js.melt <- js.melt[js.melt[,3]!=0,]
-    colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-    reproduce.js[[pmid]] <- as.character(js.melt[,2])
-    
-    ## get oc binary to match branches
-    oc.binary <- get_binary(oc, oc.cut)
-    oc.melt <- melt(oc.binary)
-    oc.melt <- oc.melt[oc.melt[,3]!=0,]
-    reproduce.oc[[pmid]] <- as.character(oc.melt[,2])
-  
-    ## samples cell compositions 
-    ctcomp <- sapply(js.melt[,2], function(tmp){
-      c <- names(clu)[clu %in% newbranch.pm[[tmp]]]
-      ctcomp <- rep(0, length(unique(alls)))
-      names(ctcomp) <- unique(alls)
-      ctcomp[names(table(alls[c]))] <- table(alls[c])
-    })
-    colnames(ctcomp) <- paste0('origin', js.melt[,2])
-    ctcomp <- ctcomp/rowSums(ctcomp)
-    
-    ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch))
-    colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch)))
-    rownames(ctcomp.new) <- unique(alls)
-    ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp
-    ctcomplist[[pmid]] <- t(ctcomp.new)
-  }
-  
-  reproduce.js <- unlist(reproduce.js)  
-  js.perc <- rep(0, length(newbranch))
-  js.perc[as.numeric(names(table(reproduce.js)))] <-  table(reproduce.js)/n.permute
-  names(js.perc) <- newbranch
-  
-  reproduce.oc <- unlist(reproduce.oc)  
-  oc.perc <- rep(0, length(newbranch))
-  oc.perc[as.numeric(names(table(reproduce.oc)))] <-  table(reproduce.oc)/n.permute
-  names(oc.perc) <- newbranch
-  
-  corr.score.m <- do.call(rbind, corr.score)
-  corr.score.v <- colSums(corr.score.m)/n.permute
-  names(corr.score.v) <- newbranch
-  
-  sort((js.perc + oc.perc)/2)
-  
-  detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE)
-  sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean)
-  sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd)
-  rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))]
-  rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))]
-  
-  result <- list(detection.rate = detection.rate, 
-                 sample.cellcomp.mean = sample.cellcomp.mean, 
-                 sample.cellcomp.sd = sample.cellcomp.sd)
-  return(result)
-}
-
-# Driver: infer the tree structure with 'HSC' as the origin cell type, call
-# evaluate_uncertainty() on it with 100 as the second argument, and save the
-# returned object as an RDS file.
-# NOTE(review): pca, ct and plotdir are not assigned in the visible part of
-# this script -- presumably they are created earlier in the file; confirm.
-a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir)
-result <- evaluate_uncertainty(a, 100)
-saveRDS(result, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/result.rds')
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/05_try_to_build_module_v3.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/05_try_to_build_module_v3.R
deleted file mode 100644
index ecfa77a..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/05_try_to_build_module_v3.R
+++ /dev/null
@@ -1,39 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 3
-max.clunum <- 50
-source("/Users/wenpinhou/Dropbox/trajectory_variability/function/01_function.R")
-plotdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module/plot/'
-rdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module/result/'
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-# low dim reduction: umap, pca, or phate
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type (at least for the cells with origin cell type), col 3 is sample name
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-# Only the PCA cell embeddings of the stored object are used below.
-umap = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate/umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-# One row per cell: cell name, cell type, and sample (the part of the cell
-# name before the first ':').
-ct = data.frame(cell = names(a), celltype = a, sample = sapply(names(a), function(i) sub(':.*', '', i)), stringsAsFactors = FALSE)
-
-# permutation
-# Note: `a` is reused from here on for the inferred tree object.
-a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir, xlab='Principal Component 1', ylab = 'Principal Component 2')
-pdf(paste0(plotdir, 'mcl.pdf'), width=5.5,height=4.5)
-# x-axis label used to read 'Pincipal Component 1'.
-print(plotmclust(a, cell_point_size = 0.1, x.lab = 'Principal Component 1', y.lab = 'Principal Component 2'))
-dev.off()
-# NOTE(review): 100 is hard-coded here while n.permute (set to 3 at the top of
-# this script) is not used -- confirm which value is intended.
-result <- evaluate_uncertainty(a, 100)
-saveRDS(result, paste0(rdir, 'result.rds'))
-
-# One CSV per element of the result list, named after the element.
-for (i in names(result)){
-  write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = TRUE)
-}
-
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/06_build_module_all_rmall_rmBM1256.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/06_build_module_all_rmall_rmBM1256.R
deleted file mode 100644
index 05771ad..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/06_build_module_all_rmall_rmBM1256.R
+++ /dev/null
@@ -1,94 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 10000
-max.clunum <- 50
-source("/Users/wenpinhou/Dropbox/trajectory_variability/function/01_function.R")
-plotdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module_3traj/plot/'
-rdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module_3traj/result/'
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-# low dim reduction: umap, pca, or phate
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type (at least for the cells with origin cell type), col 3 is sample name
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-# Only the PCA cell embeddings of the stored object are used below.
-umap = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate/ser/umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-# One row per cell: cell name, cell type, and sample (the part of the cell
-# name before the first ':').
-ct = data.frame(cell = names(a), celltype = a, sample = sapply(names(a), function(i) sub(':.*', '', i)), stringsAsFactors = FALSE)
-
-# permutation
-res = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir, xlab='Principal Component 1', ylab = 'Principal Component 2', original = TRUE)
-saveRDS(res, paste0(rdir, 'infer_tree_structure_res.rds'))
-png(paste0(plotdir, 'mcl.png'), width=900,height=800, res = 200)
-# Explicit print(), as in the other device blocks of these scripts: a plot
-# object returned at top level is only drawn by auto-printing, which does not
-# happen when the script is run through source().
-# NOTE(review): assumes plotmclust() returns a plot object rather than drawing
-# directly -- confirm against the sourced definition.
-print(plotmclust(res, cell_point_size = 0.1, x.lab = 'Principal Component 1', y.lab = 'Principal Component 2'))
-dev.off()
-result <- evaluate_uncertainty(res, n.permute)
-saveRDS(result, paste0(rdir, 'result.rds'))
-# One CSV per element of the result list, named after the element.
-for (i in names(result)){
-  write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = TRUE)
-}
-
-## Subsampling, all samples: drop a growing fraction of the cells listed in
-## res$order[[2]] and re-run the uncertainty evaluation on the remaining cells.
-for (rm.perc in seq(0.1, 0.8, 0.1)) {
-  print(rm.perc)
-  outroot <- paste0('/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/rmall/', rm.perc)
-  plotdir <- paste0(outroot, '/plot/')
-  rdir <- paste0(outroot, '/result/')
-  for (d in c(plotdir, rdir)) {
-    dir.create(d, recursive = TRUE)
-  }
-
-  ## second ordering (the original author notes this is branch 5,1)
-  selectcell <- res$order[[2]]
-  set.seed(12345)
-  rmcell <- sample(selectcell, rm.perc * length(selectcell))
-  ## every cell except the removed ones
-  subset.cell <- setdiff(rownames(pca), rmcell)
-
-  pdf(paste0(plotdir, 'mcl.pdf'), width = 6, height = 5)
-  print(plotmclust(res, cell_point_size = 0.1, x.lab = 'PC1', y.lab = 'PC2', subset.cell = subset.cell))
-  dev.off()
-
-  result <- evaluate_uncertainty(res, n.permute, subset.cell = subset.cell)
-  saveRDS(result, paste0(rdir, 'result.rds'))
-  for (i in names(result)) {
-    write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = TRUE)
-  }
-}
-  
-
-## Subsampling restricted to samples BM1, BM2, BM5 and BM6: drop a growing
-## fraction of those samples' cells within res$order[[2]] and re-run the
-## uncertainty evaluation on the remaining cells.
-for (rm.perc in seq(0.1, 0.8, 0.1)) {
-  print(rm.perc)
-  outroot <- paste0('/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/rmBM1256/', rm.perc)
-  plotdir <- paste0(outroot, '/plot/')
-  rdir <- paste0(outroot, '/result/')
-  for (d in c(plotdir, rdir)) {
-    dir.create(d, recursive = TRUE)
-  }
-
-  selectcell <- res$order[[2]]
-  ## keep only the candidate cells that come from the four listed samples
-  from.target <- ct[selectcell, 'sample'] %in% c('BM1', 'BM2', 'BM5', 'BM6')
-  selectcell <- selectcell[from.target]
-  set.seed(12345)
-  rmcell <- sample(selectcell, rm.perc * length(selectcell))
-  subset.cell <- setdiff(rownames(pca), rmcell)
-
-  pdf(paste0(plotdir, 'mcl.pdf'), width = 6, height = 5)
-  print(plotmclust(res, cell_point_size = 0.1, x.lab = 'PC1', y.lab = 'PC2', subset.cell = subset.cell))
-  dev.off()
-
-  result <- evaluate_uncertainty(res, n.permute, subset.cell = subset.cell)
-  saveRDS(result, paste0(rdir, 'result.rds'))
-  for (i in names(result)) {
-    write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = TRUE)
-  }
-}
-  
-
-
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/code/01_reproducibility.R
deleted file mode 100644
index 6a0319e..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/code/01_reproducibility.R
+++ /dev/null
@@ -1,274 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 1e3
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-mykmeans <- function(matrix, number.cluster = NA, max.cluster = 20, mc.cores = 20){
-  ## K-means clustering of the rows of `matrix`.
-  ##
-  ## matrix         numeric matrix; rows are the observations to cluster.
-  ## number.cluster number of clusters; NA (default) picks it automatically.
-  ## max.cluster    largest cluster number tried when picking automatically
-  ##                (default 20, the value that used to be hard-coded).
-  ## mc.cores       cores handed to mclapply() for that search (default 20,
-  ##                the value that used to be hard-coded).
-  ##
-  ## Returns the object produced by kmeans().
-  set.seed(12345)
-  # Attaching `parallel` inside the function is kept: it is a side effect that
-  # code run after the first call may rely on.
-  library(parallel)
-  if (is.na(number.cluster)){
-    # mclapply() does not support more than one core on Windows.
-    if (.Platform$OS.type == 'windows') mc.cores <- 1
-    # between-cluster / total sum of squares for each candidate cluster number
-    rss <- mclapply(seq_len(max.cluster), function(clunum) {
-      tmp <- kmeans(matrix, clunum, iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    }, mc.cores = mc.cores)
-    rss <- unlist(rss)
-    x <- seq_len(max.cluster)
-    # Elbow search: fit a two-segment linear model with the knot at each
-    # candidate and keep the knot with the smallest residual sum of squares.
-    optclunum <- which.min(sapply(x, function(i) {
-      x2 <- pmax(0, x - i)
-      sum(lm(rss ~ x + x2)$residuals^2)
-    }))
-    clu <- kmeans(matrix, optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)
-  }
-  clu
-}
-
-### Choose how many PCs to keep: elbow of the per-PC standard deviations,
-### located with a two-segment linear fit over the first 20 PCs.
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:20
-rss.by.knot <- sapply(2:20, function(knot) {
-  hinge <- pmax(0, x - knot)
-  fit <- lm(sdev[1:20] ~ x + hinge)
-  sum(fit$residuals^2)
-})
-optpoint <- which.min(rss.by.knot)
-## candidates start at 2, so position k corresponds to k + 1 PCs
-pcadim <- optpoint + 1
-pr <- pca[, 1:pcadim]
-
-### mclust
-# Alternative kept by the author: pass a pre-computed cluster assignment.
-# mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-# exprmclust() is given the selected PCs transposed (cells as columns).
-mcl <- exprmclust(t(pr), reduce = F)
-plotmclust(mcl, cell_point_size = 0.1)
-str(mcl)
-## find origin
-# Cross-tabulate cluster x cell type, convert each cluster's row to
-# proportions, and take the cluster with the largest 'HSC' proportion.
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-# Pseudotime of a cell = its position (1..n) inside the ordering it belongs to;
-# the vector is named by cell.
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-
-## plot pseudotime
-# `pd` is reused here: first two PCs of every cell plus its pseudotime.
-pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-library(scattermore)
-library(RColorBrewer)
-ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  geom_scattermore() +
-  scale_color_gradient(low = 'yellow', high = 'blue')
-
-# ----------------------------------------------------------------------
-# Null distributions of the Jaccard index and the overlap coefficient:
-# every original ordering against 1e3 random cell sets of the same size
-# ----------------------------------------------------------------------
-js.null <- lapply(seq(1, length(ord)), function(b){
-  branch.cells <- ord[[b]]
-  vapply(seq(1, 1e3), function(seed){
-    set.seed(seed)
-    random.cells <- sample(rownames(pr), length(branch.cells))
-    shared <- length(intersect(random.cells, branch.cells))
-    shared/length(union(random.cells, branch.cells))
-  }, numeric(1))
-})
-
-par(mfrow = c(1,3))
-hist(js.null[[1]])
-hist(js.null[[2]])
-hist(js.null[[3]])
-
-# per-ordering cutoff: 99th percentile of the null values
-js.cut <- sapply(js.null, quantile, 0.99)
-
-oc.null <- lapply(seq(1, length(ord)), function(b){
-  branch.cells <- ord[[b]]
-  vapply(seq(1, 1e3), function(seed){
-    set.seed(seed)
-    random.cells <- sample(rownames(pr), length(branch.cells))
-    shared <- length(intersect(random.cells, branch.cells))
-    shared/min(length(random.cells), length(branch.cells))
-  }, numeric(1))
-})
-par(mfrow = c(1,3))
-hist(oc.null[[1]])
-hist(oc.null[[2]])
-hist(oc.null[[3]])
-
-# per-ordering cutoff: 99th percentile of the null values
-oc.cut <- sapply(oc.null, quantile, 0.99)
-
-# -----------
-# permutation
-# -----------
-# For each of n.permute rounds: resample the cells with replacement, keep the
-# unique ones, rebuild the clustering and orderings, and score every
-# (original ordering, resampled ordering) pair by Jaccard index, overlap
-# coefficient, and pseudotime correlation on the shared cells.
-corrlist <- jslist <- oclist <- list()
-for (pmid in seq(1, n.permute)){
-  ## bootstrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  # ## cluster cells
-  mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  # start from the resampled cluster whose cells have the smallest mean
-  # pseudotime in the original ordering
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  # position of each cell inside its resampled ordering, named by cell
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  # `pd` only feeds the commented-out plot below
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore()
-
-  ## compare two MST
-  # In all three matrices: rows = resampled orderings, columns = original orderings.
-  js <- sapply(seq(1, length(ord)), function(i){
-          sapply(seq(1, length(ord.pm)), function(j){
-            b.ori <- ord[[i]]
-            b.pm <- ord.pm[[j]]
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-      })
-  oc <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              b.ori <- ord[[i]]
-              b.pm <- ord.pm[[j]]
-              oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  corr <- sapply(seq(1, length(ord)), function(i){
-           sapply(seq(1, length(ord.pm)), function(j){
-              ov = intersect(ord[[i]], ord.pm[[j]])
-              cor(pt[ov], pt.pm[ov])
-           }) 
-        })
-  # correlations that could not be computed count as 0
-  corr[is.na(corr)] <- 0
-  colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-  corrlist[[pmid]] <- corr
-}
-# Persist the per-round score matrices, one file per score.
-saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')
-saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')
-
-# The correlation list used to be written to pm_oc.rds as well, which replaced
-# the overlap-coefficient file saved on the line above.
-saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_corr.rds')
-
-# Quick look at the pooled score distributions.
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-par(mfrow = c(1,2))
-hist(jsm)
-hist(ocm)
-
-## Match resampled orderings (rows) to original orderings (columns) by Jaccard
-## index, one matrix per round, then summarise over rounds:
-##   js.perc      - fraction of rounds in which each original ordering is matched
-##   corr.score.v - pseudotime correlation of the matched pairs, summed and
-##                  divided by n.permute
-res <- corr.score <- list()
-for (i in seq_along(jslist)){
-  js <- jslist[[i]]
-  # 1 where the Jaccard index exceeds the null cutoff of that original ordering
-  js.binary <- sapply(seq(1,ncol(js)), function(c){
-    (js[,c] > js.cut[c]) + 0
-  })
-  # Reduce to at most one match per row and per column. The column pass used to
-  # zero row `dup.id` instead of the column and to index with the whole
-  # `dup.id` vector instead of the loop variable, so duplicated columns were
-  # never cleared; the indices below mirror the overlap-coefficient version.
-  while (any(rowSums(js.binary) > 1, na.rm = TRUE) || any(colSums(js.binary) > 1, na.rm = TRUE)){
-    # a resampled ordering matched to several originals keeps its best original
-    for (dup.i in which(rowSums(js.binary) > 1)){
-      addid <- which.max(js[dup.i, ])
-      js.binary[dup.i, ] <- 0
-      js.binary[dup.i, addid] <- 1
-    }
-    # an original matched by several resampled orderings keeps the best of them
-    for (dup.i in which(colSums(js.binary) > 1)){
-      addid <- which.max(js[, dup.i])
-      js.binary[, dup.i] <- 0
-      js.binary[addid, dup.i] <- 1
-    }
-  }
-
-  # keep the correlation only where a match was called
-  corr.score[[i]] <- corrlist[[i]] * js.binary
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  # column indices (original orderings) that were matched in this round
-  res[[i]] <- as.character(js.melt[,2])
-}
-res <- unlist(res)
-js.perc <- table(res)/n.permute
-saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-corr.score.m <- do.call(rbind, corr.score)
-corr.score.v <- colSums(corr.score.m)/n.permute
-saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds')
-
-## Same matching as for the Jaccard index, driven by the overlap coefficient:
-## rows are resampled orderings, columns are original orderings.
-res <- sapply(seq(1,length(oclist)), function(pm){
-  score <- oclist[[pm]]
-  # 1 where the score exceeds the null cutoff of that original ordering
-  hit <- sapply(seq(1,ncol(score)), function(k){
-    (score[,k] > oc.cut[k]) + 0
-  })
-  # reduce to at most one match per row and per column
-  while (any(rowSums(hit) > 1, na.rm = TRUE) | any(colSums(hit) > 1, na.rm = TRUE)){
-    for (r in which(rowSums(hit) > 1)){
-      best <- which.max(score[r, ])
-      hit[r, ] <- 0
-      hit[r, best] <- 1
-    }
-    for (k in which(colSums(hit) > 1)){
-      best <- which.max(score[, k])
-      hit[, k] <- 0
-      hit[best, k] <- 1
-    }
-  }
-  matched <- melt(hit)
-  matched <- matched[matched[,3]!=0,]
-  # column indices (original orderings) matched in this round
-  as.character(matched[,2])
-})
-res <- unlist(res)
-oc.perc <- table(res)/n.permute
-sort((js.perc + oc.perc)/2)
-saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/code/02_samples_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/code/02_samples_reproducibility.R
deleted file mode 100644
index 82056b3..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/code/02_samples_reproducibility.R
+++ /dev/null
@@ -1,452 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 1e3
-max.clunum <- 50
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-
-mykmeans <- function(matrix, number.cluster = NA, max.cluster = 20, mc.cores = 20){
-  ## K-means clustering of the rows of `matrix`.
-  ##
-  ## matrix         numeric matrix; rows are the observations to cluster.
-  ## number.cluster number of clusters; NA (default) picks it automatically.
-  ## max.cluster    largest cluster number tried when picking automatically
-  ##                (default 20, the value that used to be hard-coded).
-  ## mc.cores       cores handed to mclapply() for that search (default 20,
-  ##                the value that used to be hard-coded).
-  ##
-  ## Returns the object produced by kmeans().
-  set.seed(12345)
-  # Attaching `parallel` inside the function is kept: top-level code further
-  # down this script calls mclapply()/detectCores() without loading it.
-  library(parallel)
-  if (is.na(number.cluster)){
-    # mclapply() does not support more than one core on Windows.
-    if (.Platform$OS.type == 'windows') mc.cores <- 1
-    # between-cluster / total sum of squares for each candidate cluster number
-    rss <- mclapply(seq_len(max.cluster), function(clunum) {
-      tmp <- kmeans(matrix, clunum, iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    }, mc.cores = mc.cores)
-    rss <- unlist(rss)
-    x <- seq_len(max.cluster)
-    # Elbow search: fit a two-segment linear model with the knot at each
-    # candidate and keep the knot with the smallest residual sum of squares.
-    optclunum <- which.min(sapply(x, function(i) {
-      x2 <- pmax(0, x - i)
-      sum(lm(rss ~ x + x2)$residuals^2)
-    }))
-    clu <- kmeans(matrix, optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)
-  }
-  clu
-}
-
-### Choose how many PCs to keep: elbow of the per-PC standard deviations,
-### located with a two-segment linear fit over the first max.clunum PCs.
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:max.clunum
-rss.by.knot <- sapply(2:max.clunum, function(knot) {
-  hinge <- pmax(0, x - knot)
-  fit <- lm(sdev[1:max.clunum] ~ x + hinge)
-  sum(fit$residuals^2)
-})
-optpoint <- which.min(rss.by.knot)
-## candidates start at 2, so position k corresponds to k + 1 PCs
-pcadim <- optpoint + 1
-pr <- pca[, 1:pcadim]
-
-## clustering
-# k-means with a fixed 14 clusters on the selected PCs
-clu <- mykmeans(pr, number.cluster = 14)$cluster
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-
-# mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-# ggplot(data = pd, aes(x = x, y = y, color = clu)) + 
-#   geom_scattermore()+
-#   scale_color_manual(values = mypalette(14))+
-#   theme_classic() + xlab('UMAP1') + ylab('UMAP2')
-
-## cell type composition in clusters
-# cluster x cell type table, each cluster's row converted to proportions
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-
-# ggplot(data = pd) +
-#   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-#   theme_classic() +
-#   ylab('Celltype Proportion') +
-#   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-### mclust
-# exprmclust() is given the k-means assignment instead of clustering itself
-mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-# mcl <- exprmclust(t(pr), reduce = F)
-# plotmclust(mcl, cell_point_size = 0.1)
-# str(mcl)
-
-# --------------------
-# construct pseudotime 
-# --------------------
-## find origin
-# the origin is the cluster with the largest 'HSC' proportion
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-# Pseudotime of a cell = its position (1..n) inside the ordering it belongs to;
-# the vector is named by cell.
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-# ## plot pseudotime
-# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-# library(scattermore)
-# library(RColorBrewer)
-# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-#   geom_scattermore() +
-#   scale_color_gradient(low = 'yellow', high = 'blue')
-
-# ------------------------------------------------------------
-# get candidate branches to test reproducibility, 20200726 >>
-# ------------------------------------------------------------
-findbranch <- function(mst, order, origin){
-  ## List the candidate branches of a tree: the paths that join two "key"
-  ## vertices (leaves, branching points, and the origin) without passing
-  ## through a third one.
-  ##
-  ## mst    igraph tree whose vertex names are cluster ids.
-  ## order  named list of orderings; the names hold comma-separated cluster ids
-  ##        after an optional 'backbone ' / 'branch: ' prefix.
-  ## origin id of the starting cluster.
-  ##
-  ## Returns a list with one vector of vertex ids per branch, oriented the way
-  ## the branch appears in names(order) when it appears there, reversed
-  ## otherwise. The result is always a list, also when all branches have the
-  ## same length (sapply() used to collapse that case into a matrix).
-  library(igraph)
-  deg <- degree(mst)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  # every unordered pair of key vertices
-  eg <- expand.grid(seq_along(vertex), seq_along(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg <- data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  # seq_len() keeps this a no-op when there is no pair at all
-  tmpbranch <- lapply(seq_len(nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    # keep the path only if its two end points are the only key vertices on it
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[lengths(tmpbranch) > 0]
-
-  # Known orderings as ',id,id,...,' strings. Delimiting both sides with commas
-  # stops '1,2' from matching inside '11,2' or '1,20'. The strings are used
-  # directly: the previous detour through sapply()/names() produced an empty
-  # lookup string whenever all orderings had the same number of clusters.
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  allbranch <- paste0(',', allbranch, ',')
-  newbranch <- lapply(tmpbranch, function(i) {
-    tmp <- paste0(',', paste0(i, collapse = ','), ',')
-    if (any(grepl(tmp, allbranch, fixed = TRUE))) i else rev(i)
-  })
-  return(newbranch)
-}
-newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-
-
-# ----------------------------------------------------------------------
-# Per-sample null distribution of the Jaccard index: every candidate
-# branch against 1e3 random cell sets of the same size, scored sample by
-# sample (sample = part of the cell name before the first ':')
-# ----------------------------------------------------------------------
-js.null <- lapply(seq(1, length(newbranch)), function(b){
-  # cells of all clusters on this branch
-  branch.cells <- unlist(lapply(newbranch[[b]], function(k) names(mcl$clusterid)[mcl$clusterid == k]))
-  branch.sample <- gsub(':.*', '', branch.cells)
-  sample.ids <- unique(gsub(':.*', '', rownames(pr)))
-  per.seed <- mclapply(seq(1, 1e3), function(seed){
-    set.seed(seed)
-    random.cells <- sample(rownames(pr), length(branch.cells))
-    random.sample <- gsub(':.*', '', random.cells)
-    sapply(sample.ids, function(s){
-      rnd <- random.cells[random.sample == s]
-      ori <- branch.cells[branch.sample == s]
-      length(intersect(rnd, ori))/length(union(rnd, ori))
-    })
-  }, mc.cores = detectCores()-2)
-  # one row per random draw, one column per sample
-  do.call(rbind, per.seed)
-})
-# 99th percentile of the null values, per sample and per branch
-js.cut <- sapply(js.null, function(m) apply(m, 2, quantile, 0.99))
-# ------------------
-
-# Per-sample null distribution of the overlap coefficient: every candidate
-# branch against 1e3 random cell sets of the same size, scored sample by sample.
-#
-# `alls` (the sample label of every cell, i.e. the part of the cell name
-# before the first ':') was only ever assigned inside the js.null closure, so
-# it did not exist here. It is defined at top level so that this block and the
-# code further down that reads `alls` can see it.
-alls <- gsub(':.*', '', rownames(pr))
-oc.null <- lapply(seq_along(newbranch), function(i){
-  # cells of all clusters on this branch, and the sample each one comes from
-  b.ori <- unlist(lapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  b.ori.alls <- gsub(':.*', '', b.ori)
-  tmp <- mclapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    b.pm.alls <- gsub(':.*', '', b.pm)
-    # NOTE(review): a sample with no cell in either set gives 0/0 = NaN here,
-    # which quantile() below rejects -- confirm how that case should be scored.
-    sapply(unique(alls), function(s){
-      b.pm.s <- b.pm[b.pm.alls == s]
-      b.ori.s <- b.ori[b.ori.alls == s]
-      length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s))
-    })
-  }, mc.cores = detectCores()-2)
-  # one row per random draw, one column per sample
-  do.call(rbind, tmp)
-})
-# 99th percentile of the null values, per sample and per branch
-oc.cut <- sapply(oc.null, function(i) apply(i, 2, quantile, 0.99))
-
-# -----------
-# permutation 
-# -----------
-corrlist.alls <- jslist.alls <- oclist.alls <- list()
-n.permute = 100
-for (pmid in seq(1, n.permute)){
-  ## bootstrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  # ## cluster cells
-  clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-
-  pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-  pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-  pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-  pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-  pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-
-  # ggplot() + 
-  #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-  #   scale_color_manual(values = mypalette(14))+
-  #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-  #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-  
-
-  ## cell type composition in clusters
-  # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  # tab <- table(pd[,3:4])
-  # tab <- tab/rowSums(tab)
-  # pd <- melt(tab)
-  # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  # ggplot(data = pd) +
-  #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-  #   theme_classic() +
-  #   ylab('Celltype Proportion') +
-  #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-  # build pseudotime
-  mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  ## plot pseudotime
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore() + theme_classic()
-  
-  # get candidate branches
-  newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-  
-  ## compare two MST
-  js <- sapply(seq(1, length(newbranch)), function(i){
-          print('i')
-          print(i)
-          id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-          cells <- ord[[id]]
-          b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-          b.ori.alls <- gsub(':.*', '', b.ori)
-          tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-            print(j)
-            id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-            cells <- ord.pm[[id]]
-            b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-            b.pm.alls <- gsub(':.*', '', b.pm)
-            # js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-            tmpp <- sapply(unique(alls), function(s){
-              b.pm.s <- b.pm[b.pm.alls == s]
-              b.ori.s <- b.ori[b.ori.alls == s]
-              length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s))
-            })  
-          },mc.cores = detectCores()-2)
-          tmp <- do.call(rbind,tmp)
-          rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-          tmp
-        }, simplify = FALSE)
-  names(js) <- paste0('branch', seq(1, length(newbranch)))
-  ###### =====================================
-  oc <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-            b.ori.alls <- gsub(':.*', '', b.ori)
-            tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-                id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                cells <- ord.pm[[id]]
-                b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                b.pm.alls <- gsub(':.*', '', b.pm)
-                # oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-                tmpp <- sapply(unique(alls), function(s){
-                  b.pm.s <- b.pm[b.pm.alls == s]
-                  b.ori.s <- b.ori[b.ori.alls == s]
-                  length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s))
-                })  
-          },mc.cores = detectCores()-2)
-          tmp <- do.call(rbind,tmp)
-          rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-          tmp
-        }, simplify = FALSE)
-  names(oc) <- paste0('branch', seq(1, length(newbranch)))           
-        
-  
-  corr <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-              b.ori.alls <- gsub(':.*', '', b.ori)
-              tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  b.pm.alls <- gsub(':.*', '', b.pm)
-                  # ov = intersect(b.ori, b.pm)
-                  # cor(pt[ov], pt.pm[ov])
-                  tmpp <- sapply(unique(alls), function(s){
-                    b.pm.s <- b.pm[b.pm.alls == s]
-                    b.ori.s <- b.ori[b.ori.alls == s] 
-                    ov = intersect(b.ori.s, b.pm.s)
-                    cor(pt[ov], pt.pm[ov])
-                  })
-              }, mc.cores = detectCores()-2) 
-              tmp <- do.call(rbind, tmp)
-              rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm)))
-              tmp[is.na(tmp)] <- 0
-              tmp
-          }, simplify = FALSE)
-  # corr[is.na(corr)] <- 0
-  names(corr) <- paste0('branch', seq(1, length(newbranch)))           
-  # colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-  jslist.alls[[pmid]] <- js
-  oclist.alls[[pmid]] <- oc
-  corrlist.alls[[pmid]] <- corr
-}
-saveRDS(jslist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_js_alls.rds')   
-saveRDS(oclist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_oc_alls.rds')   
-saveRDS(corrlist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/samples/result/pm_oc_alls.rds')   
-
-# jsm <- do.call(rbind, jslist)
-# ocm <- do.call(rbind, oclist)
-# par(mfrow = c(1,2))
-# hist(jsm)
-# hist(ocm)
-s = unique(alls)[1]
-df.alls <- lapply(unique(alls), function(s){
-  jslist = sapply(jslist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  oclist = sapply(oclist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  corrlist = sapply(corrlist.alls, function(i){
-    sapply(i, function(ii) ii[,s])
-  }, simplify = FALSE)
-  
-  res <- corr.score <- list()
-  for (i in seq(1, length(jslist))){
-    print(i)
-    js <- jslist[[i]]
-    js.binary <- sapply(seq(1,ncol(js)), function(c){
-      (js[,c] > js.cut[c]) + 0
-    })
-    while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){
-      dup.id <- which(rowSums(js.binary) > 1)
-      if (length(dup.id) == 1){
-        addid <- which.max(js[dup.id, ])
-        js.binary[dup.id, ] <- 0
-        js.binary[dup.id, addid] <- 1  
-      } else if (length(dup.id) > 1) {
-        for (dup.i in dup.id){
-          print(dup.i)
-          addid <- which.max(js[dup.i, ])
-          js.binary[dup.i, ] <- 0
-          js.binary[dup.i, addid] <- 1  
-        }
-      }
-        
-      dup.id <- which(colSums(js.binary) > 1)
-      if (length(dup.id) == 1){
-        addid <- which.max(js[, dup.id])
-        js.binary[, dup.id] <- 0
-        js.binary[addid, dup.id] <- 1  
-      } else if (length(dup.id) > 1) {
-        for (dup.i in dup.id){
-          addid <- which.max(js[, dup.i])
-          js.binary[, dup.i] <- 0
-          js.binary[addid, dup.i] <- 1  
-        }
-      }
-    }
-    
-    corr.score[[i]] <- corrlist[[i]] * js.binary
-    js.melt <- melt(js.binary)
-    js.melt <- js.melt[js.melt[,3]!=0,]
-    res[[i]] <- as.character(js.melt[,2])
-  }
-  res <- unlist(res)  
-  js.perc <- table(res)/n.permute
-  names(js.perc) <- newbranch
-  # saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/js_percentage.rds')
-  
-  corr.score.m <- do.call(rbind, corr.score)
-  corr.score.v <- colSums(corr.score.m)/n.permute
-  names(corr.score.v) <- newbranch
-  # saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/corr_score.rds')
-  
-  res <- sapply(seq(1,length(oclist)), function(i){
-    print(i)
-    oc <- oclist[[i]]
-    oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-      (oc[,c] > oc.cut[c]) + 0
-    })
-    while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){
-      dup.id <- which(rowSums(oc.binary) > 1)
-      if (length(dup.id) == 1){
-        addid <- which.max(oc[dup.id, ])
-        oc.binary[dup.id, ] <- 0
-        oc.binary[dup.id, addid] <- 1  
-      } else if (length(dup.id) > 1) {
-        for (dup.i in dup.id){
-          addid <- which.max(oc[dup.i, ])
-          oc.binary[dup.i, ] <- 0
-          oc.binary[dup.i, addid] <- 1  
-        }
-      }
-      dup.id <- which(colSums(oc.binary) > 1)
-      if (length(dup.id) == 1){
-        addid <- which.max(oc[, dup.id])
-        oc.binary[, dup.id] <- 0
-        oc.binary[addid, dup.id] <- 1  
-      } else if (length(dup.id) > 1) {
-        for (dup.i in dup.id){
-          addid <- which.max(oc[, dup.i])
-          oc.binary[, dup.i] <- 0
-          oc.binary[addid, dup.i] <- 1  
-        }
-      }
-    }
-    oc.melt <- melt(oc.binary)
-    oc.melt <- oc.melt[oc.melt[,3]!=0,]
-    as.character(oc.melt[,2])
-  })
-  res <- unlist(res)  
-  oc.perc <- table(res)/n.permute
-  names(oc.perc) <- newbranch
-  sort((js.perc + oc.perc)/2)
-  
-  df <- data.frame(js.perc = js.perc, oc.perc = oc.perc, corr.score.v = corr.score.v)
-  df <- df[, c(2,4,5)]
-
-# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/oc_percentage.rds')
-  
-})
-names(df.alls) <- unique(alls)
-df.alls[order(names(df.alls))]
-
-
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/code/03_try_to_build_module.R b/hca_bone_marrow_data_analysis/tree_variability/code/03_try_to_build_module.R
deleted file mode 100644
index c822909..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/code/03_try_to_build_module.R
+++ /dev/null
@@ -1,443 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 3
-max.clunum <- 50
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-#  umap, pca
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-alls <- sub(':.*', '', names(a))
-names(alls) <- names(a)
-  
-mykmeans <- function(matrix, number.cluster = NA){
-  ## cluster the rows
-  set.seed(12345)
-  library(parallel)
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=20)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-
-### determine numPC
-set.seed(12345)
-sdev <- apply(pca, 2, sd)
-x <- 1:max.clunum
-optpoint <- which.min(sapply(2:max.clunum, function(i) {
-  x2 <- pmax(0, x - i)
-  sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2)
-}))
-pcadim = optpoint + 1
-pr <- pca[,1:pcadim]  # 7
-
-## clustering
-clu <- mykmeans(pr, number.cluster = 14)$cluster
-# pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-# mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-# ggplot(data = pd, aes(x = x, y = y, color = clu)) + 
-#   geom_scattermore()+
-#   scale_color_manual(values = mypalette(14))+
-#   theme_classic() + xlab('UMAP1') + ylab('UMAP2')
-
-# ## cell type composition in clusters
-# pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-# tab <- table(pd[,3:4])
-# tab <- tab/rowSums(tab)
-# pd <- melt(tab)
-# pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-# 
-# ggplot(data = pd) +
-#   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-#   theme_classic() +
-#   ylab('Celltype Proportion') +
-#   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-### mclust
-mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-# mcl <- exprmclust(t(pr), reduce = F)
-# plotmclust(mcl, cell_point_size = 0.1)
-# str(mcl)
-
-# --------------------
-# construct pseudotime 
-# --------------------
-## find origin
-pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-tab <- table(pd[,3:4])
-tab <- tab/rowSums(tab)
-pd <- melt(tab)
-pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-tmp <- pd[pd$celltype == 'HSC', ]
-origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-
-## construct pseudotime
-ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-str(ord)
-length(ord)
-pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-names(pt) <- unname(unlist(ord))
-
-# ## plot pseudotime
-# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-# library(scattermore)
-# library(RColorBrewer)
-# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-#   geom_scattermore() +
-#   scale_color_gradient(low = 'yellow', high = 'blue')
-
-# ------------------------------------------------------------
-# get candidate branches to test reproducibility, 20200726 >>
-# ------------------------------------------------------------
-findbranch <- function(mst, order, origin){
-  deg <- degree(mst)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  eg <- expand.grid(1:length(vertex), 1:length(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg = data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  library(igraph)
-  tmpbranch <- lapply(seq(1,nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[sapply(tmpbranch, length) >0]  
- 
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]])
-  allbranch <- paste0(names(allbranch), collapse = ' ')
-  newbranch <-sapply(tmpbranch, function(i) {
-      tmp <- paste0(i, collapse = ',')
-      if (!grepl(tmp, allbranch)){
-        rev(i)
-      } else {
-        i
-      }
-  })
-  return(newbranch)
-}
-newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-
-# -----------------------------------------------------
-# Evaluate robustness of tree branches using resampling
-# -----------------------------------------------------
-
-# null distribution of Jaccard index, overlap coefficient
-
-js.null <- lapply(seq(1, length(newbranch)), function(i) {
-  b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-  })
-})
-
-# par(mfrow = c(2,ceiling(length(js.null)/2)))
-# for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50)
-
-js.cut <- sapply(js.null, quantile, 0.99)
-
-oc.null <- lapply(seq(1, length(newbranch)), function(i){
-  b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-  tmp <- sapply(seq(1, 1e3), function(j){
-    set.seed(j)
-    b.pm <- sample(rownames(pr), length(b.ori))
-    length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-  })
-})
-# par(mfrow = c(2,ceiling(length(oc.null)/2)))
-# for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50)
-oc.cut <- sapply(oc.null, quantile, 0.99)
-
-# permutation 
-
-get_binary <- function(js){
-  js.binary <- sapply(seq(1,ncol(js)), function(c){
-    (js[,c] > js.cut[c]) + 0
-  })
-  while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){
-    dup.id <- which(rowSums(js.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(js[dup.id, ])
-      js.binary[dup.id, ] <- 0
-      js.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(js[dup.i, ])
-        js.binary[dup.i, ] <- 0
-        js.binary[dup.i, addid] <- 1  
-      }
-    }
-      
-    dup.id <- which(colSums(js.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(js[, dup.id])
-      js.binary[, dup.id] <- 0
-      js.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(js[, dup.i])
-        js.binary[, dup.i] <- 0
-        js.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  return(js.binary)
-}
-
-ctcomplist <- reproduce <- corr.score <- corrlist <- jslist <- oclist <- list()
-
-for (pmid in seq(1, n.permute)){
-  ## boostrap cells
-  print(pmid)
-  set.seed(pmid)
-  pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-  pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-  
-  ## cluster cells
-  clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-
-  # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-  pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-  pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-  pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-  pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-  pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  # ggplot() + 
-  #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-  #   scale_color_manual(values = mypalette(14))+
-  #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-  #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-  
-
-  ## cell type composition in clusters
-  # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  # tab <- table(pd[,3:4])
-  # tab <- tab/rowSums(tab)
-  # pd <- melt(tab)
-  # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  # ggplot(data = pd) +
-  #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-  #   theme_classic() +
-  #   ylab('Celltype Proportion') +
-  #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-
-  ## build pseudotime
-  mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-  # plotmclust(mcl.pm, cell_point_size = 0.1)
-  
-  ## select origin cluster
-  pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-  start.cluster <- names(which.min(pt.pm.mean))
-  
-  ## construct pseudotime
-  ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-  # str(ord.pm)
-  
-  pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-  names(pt.pm) <- unname(unlist(ord.pm))
-  # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-  ## plot pseudotime
-  
-  pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore() + theme_classic()
-  # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  
-  # get candidate branches
-  newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-  
-  ## compare two MST
-  js <- sapply(seq(1, length(newbranch)), function(i){
-          id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-          cells <- ord[[id]]
-          b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-          sapply(seq(1, length(newbranch.pm)), function(j){
-            print(j)
-            id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-            cells <- ord.pm[[id]]
-            b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-            js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-          })
-        })
-  oc <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-            sapply(seq(1, length(newbranch.pm)), function(j){
-                id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                cells <- ord.pm[[id]]
-                b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-           }) 
-        })
-  corr <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells)
-              
-              sapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  ov = intersect(b.ori, b.pm)
-                  cor(pt[ov], pt.pm[ov])
-              }) 
-          })
-  corr[is.na(corr)] <- 0
-  colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-  jslist[[pmid]] <- js
-  oclist[[pmid]] <- oc
-  corrlist[[pmid]] <- corr
-  
-  ## get js binary to matched branches <<<<<<<<<<<<<<< 
-  js.binary <- get_binary(js)
-  corr.score[[pmid]] <- corr * js.binary
-  js.melt <- melt(js.binary)
-  js.melt <- js.melt[js.melt[,3]!=0,]
-  colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-  reproduce[[pmid]] <- as.character(js.melt[,2])
-  ## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-  ## 
-  tmp <- js.melt[1,2]
-  ctcomp <- sapply(js.melt[,2], function(tmp){
-    c <- names(clu)[clu %in% newbranch.pm[[tmp]]]
-    ctcomp <- rep(0, length(unique(alls)))
-    names(ctcomp) <- unique(alls)
-    ctcomp[names(table(alls[c]))] <- table(alls[c])
-  })
-  colnames(ctcomp) <- paste0('origin', js.melt[,2])
-  ctcomp <- ctcomp/rowSums(ctcomp)
-  
-  
-  ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch))
-  colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch)))
-  rownames(ctcomp.new) <- unique(alls)
-  ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp
-  ctcomplist[[pmid]] <- t(ctcomp.new)
-  
-}
-
-# saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds')   
-# saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-# 
-# saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds')   
-
-jsm <- do.call(rbind, jslist)
-ocm <- do.call(rbind, oclist)
-# par(mfrow = c(1,2))
-# hist(jsm)
-# hist(ocm)
-
-## moved within boostrap
-# reproduce <- corr.score <- list()
-# for (i in seq(1, length(jslist))){
-#   print(i)
-#   js <- jslist[[i]]
-#   js.binary <- get_binary(js)
-#   corr.score[[i]] <- corrlist[[i]] * js.binary
-#   js.melt <- melt(js.binary)
-#   js.melt <- js.melt[js.melt[,3]!=0,]
-#   colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-#   reproduce[[i]] <- as.character(js.melt[,2])
-# }
-
-reproduce <- unlist(reproduce)  
-
-
-js.perc <- rep(0, length(newbranch))
-js.perc[as.numeric(names(table(reproduce)))] <-  table(reproduce)/n.permute
-names(js.perc) <- newbranch
-# saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds')
-
-corr.score.m <- do.call(rbind, corr.score)
-corr.score.v <- colSums(corr.score.m)/n.permute
-names(corr.score.v) <- newbranch
-# saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds')
-
-res <- sapply(seq(1,length(oclist)), function(i){
-  print(i)
-  oc <- oclist[[i]]
-  oc.binary <- sapply(seq(1,ncol(oc)), function(c){
-    (oc[,c] > oc.cut[c]) + 0
-  })
-  while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){
-    dup.id <- which(rowSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[dup.id, ])
-      oc.binary[dup.id, ] <- 0
-      oc.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[dup.i, ])
-        oc.binary[dup.i, ] <- 0
-        oc.binary[dup.i, addid] <- 1  
-      }
-    }
-    dup.id <- which(colSums(oc.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(oc[, dup.id])
-      oc.binary[, dup.id] <- 0
-      oc.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(oc[, dup.i])
-        oc.binary[, dup.i] <- 0
-        oc.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  oc.melt <- melt(oc.binary)
-  oc.melt <- oc.melt[oc.melt[,3]!=0,]
-  as.character(oc.melt[,2])
-})
-res <- unlist(res)  
-oc.perc <- rep(0, length(newbranch))
-oc.perc[as.numeric(names(table(res)))] <-  table(res)/n.permute
-names(oc.perc) <- newbranch
-sort((js.perc + oc.perc)/2)
-
-# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds')
-
-detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE)
-sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean)
-sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd)
-rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))]
-rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))]
-
-result <- list(detection.rate = detection.rate, 
-               sample.cellcomp.mean = sample.cellcomp.mean, 
-               sample.cellcomp.sd = sample.cellcomp.sd)
-
diff --git a/hca_bone_marrow_data_analysis/tree_variability/code/04_try_to_build_module_v2.R b/hca_bone_marrow_data_analysis/tree_variability/code/04_try_to_build_module_v2.R
deleted file mode 100644
index 2c6331c..0000000
--- a/hca_bone_marrow_data_analysis/tree_variability/code/04_try_to_build_module_v2.R
+++ /dev/null
@@ -1,404 +0,0 @@
-rm(list=ls())
-library(ggplot2)
-library(Seurat)
-library(reshape2)
-library(TSCAN)
-library(scattermore)
-library(RColorBrewer)
-suppressMessages(library(igraph))
-n.permute <- 3
-max.clunum <- 50
-setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate")
-
-# --------------------------------------------------------------
-# input: seurat integrated object including:
-#  umap, pca
-# celltype: a dataframe, col 1 is cell name, col 2 is cell type
-# origin: the origin cell type
-# --------------------------------------------------------------
-# read in data
-umap = readRDS('umap.rds')
-pca <- as.matrix(umap@reductions$pca@cell.embeddings)
-# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F)
-str(pca)
-a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds')
-ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE)
-  
-mykmeans <- function(matrix, number.cluster = NA){
-  ## cluster the rows
-  set.seed(12345)
-  library(parallel)
-  if (is.na(number.cluster)){
-    maxclunum <- 20
-    rss <- mclapply(1:maxclunum,function(clunum) {
-      tmp <- kmeans(matrix,clunum,iter.max = 1000)
-      tmp$betweenss/tmp$totss
-    },mc.cores=20)
-    rss <- unlist(rss)
-    x <- 1:maxclunum
-    optclunum <- which.min(sapply(1:maxclunum, function(i) {
-        x2 <- pmax(0, x - i)
-        sum(lm(rss ~ x + x2)$residuals^2)  ## check this
-    }))
-    clu <- kmeans(matrix,optclunum)
-  } else {
-    clu <- kmeans(matrix, number.cluster)    
-  }
-    return(clu)
-}
-findbranch <- function(mst, order, origin){
-  deg <- degree(mst)
-  vertex <- names(deg[which(deg > 2 | deg == 1)])
-  if (!origin %in% vertex) vertex <- c(origin, vertex)
-  eg <- expand.grid(1:length(vertex), 1:length(vertex))
-  eg <- eg[eg[,1]<eg[,2],]
-  eg = data.frame(vertex[eg[,1]], vertex[eg[,2]], stringsAsFactors = FALSE)
-  library(igraph)
-  tmpbranch <- lapply(seq(1,nrow(eg)), function(i){
-    sp <- shortest_paths(mst, from = eg[i,1], to = eg[i,2])$vpath[[1]]
-    if (sum(vertex %in% sp) == 2) as.vector(sp)
-  })
-  tmpbranch <- tmpbranch[sapply(tmpbranch, length) >0]  
- 
-  allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order)))
-  allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]])
-  allbranch <- paste0(names(allbranch), collapse = ' ')
-  newbranch <-sapply(tmpbranch, function(i) {
-      tmp <- paste0(i, collapse = ',')
-      if (!grepl(tmp, allbranch)){
-        rev(i)
-      } else {
-        i
-      }
-  })
-  return(newbranch)
-}
-get_binary <- function(matrix, matrix.cut){
-  ## match boostrap and origin branches.
-  ## matrix: #boostrap.branch * #origin.branch, values are js or oc
-  ## matrix.cut: js or oc null distribution cutoff 
-  matrix.binary <- sapply(seq(1,ncol(matrix)), function(c){
-    (matrix[,c] > matrix.cut[c]) + 0
-  })
-  while (length(which(rowSums(matrix.binary) > 1)) > 0 | length(which(colSums(matrix.binary) > 1)) > 0){
-    dup.id <- which(rowSums(matrix.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(matrix[dup.id, ])
-      matrix.binary[dup.id, ] <- 0
-      matrix.binary[dup.id, addid] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(matrix[dup.i, ])
-        matrix.binary[dup.i, ] <- 0
-        matrix.binary[dup.i, addid] <- 1  
-      }
-    }
-      
-    dup.id <- which(colSums(matrix.binary) > 1)
-    if (length(dup.id) == 1){
-      addid <- which.max(matrix[, dup.id])
-      matrix.binary[, dup.id] <- 0
-      matrix.binary[addid, dup.id] <- 1  
-    } else if (length(dup.id) > 1) {
-      for (dup.i in dup.id){
-        addid <- which.max(matrix[, dup.i])
-        matrix.binary[, dup.i] <- 0
-        matrix.binary[addid, dup.i] <- 1  
-      }
-    }
-  }
-  return(matrix.binary)
-}
-
-### determine numPC
-infer_tree_structure <- function(pca, ct, origin.celltype){
-  alls <- sub(':.*', '', ct$cell)
-  names(alls) <- ct$cell
-  set.seed(12345)
-  sdev <- apply(pca, 2, sd)
-  x <- 1:max.clunum
-  optpoint <- which.min(sapply(2:max.clunum, function(i) {
-    x2 <- pmax(0, x - i)
-    sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2)
-  }))
-  pcadim = optpoint + 1
-  pr <- pca[,1:pcadim]  # 7
-  
-  ## clustering
-  clu <- mykmeans(pr, number.cluster = 14)$cluster
-  # pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)]))
-  # mypalette = colorRampPalette(brewer.pal(9,'Set1'))
-  # ggplot(data = pd, aes(x = x, y = y, color = clu)) + 
-  #   geom_scattermore()+
-  #   scale_color_manual(values = mypalette(14))+
-  #   theme_classic() + xlab('UMAP1') + ylab('UMAP2')
-  
-  # ## cell type composition in clusters
-  # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  # tab <- table(pd[,3:4])
-  # tab <- tab/rowSums(tab)
-  # pd <- melt(tab)
-  # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  # 
-  # ggplot(data = pd) +
-  #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-  #   theme_classic() +
-  #   ylab('Celltype Proportion') +
-  #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-  
-  ### mclust
-  mcl <- exprmclust(t(pr),cluster=clu,reduce=F)
-  # mcl <- exprmclust(t(pr), reduce = F)
-  # plotmclust(mcl, cell_point_size = 0.1)
-  # str(mcl)
-  
-  # --------------------
-  # construct pseudotime 
-  # --------------------
-  ## find origin
-  pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid))
-  pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-  tab <- table(pd[,3:4])
-  tab <- tab/rowSums(tab)
-  pd <- melt(tab)
-  pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-  tmp <- pd[pd$celltype == origin.celltype, ]
-  origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1])
-  
-  ## construct pseudotime
-  ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T)
-  str(ord)
-  length(ord)
-  pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i)))
-  names(pt) <- unname(unlist(ord))
-  
-  # ## plot pseudotime
-  # pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)]))
-  # library(scattermore)
-  # library(RColorBrewer)
-  # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-  #   geom_scattermore() +
-  #   scale_color_gradient(low = 'yellow', high = 'blue')
-  
-  # ------------------------------------------------------------
-  # get candidate branches to test reproducibility, 20200726 >>
-  # ------------------------------------------------------------
-  
-  newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster)  
-  
-  # -----------------------------------------------------
-  # Evaluate robustness of tree branches using resampling
-  # -----------------------------------------------------
-  
-  # null distribution of Jaccard index, overlap coefficient
-  
-  js.null <- lapply(seq(1, length(newbranch)), function(i) {
-    b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-    tmp <- sapply(seq(1, 1e3), function(j){
-      set.seed(j)
-      b.pm <- sample(rownames(pr), length(b.ori))
-      length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-    })
-  })
-  
-  # par(mfrow = c(2,ceiling(length(js.null)/2)))
-  # for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50)
-  
-  js.cut <- sapply(js.null, quantile, 0.99)
-  
-  oc.null <- lapply(seq(1, length(newbranch)), function(i){
-    b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c])))
-    tmp <- sapply(seq(1, 1e3), function(j){
-      set.seed(j)
-      b.pm <- sample(rownames(pr), length(b.ori))
-      length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-    })
-  })
-  # par(mfrow = c(2,ceiling(length(oc.null)/2)))
-  # for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50)
-  oc.cut <- sapply(oc.null, quantile, 0.99)
-  
-  mcl$pseudotime <- pt
-  mcl$branch <- newbranch
-  mcl$js.cut <- js.cut
-  mcl$oc.cut <- oc.cut
-  mcl$pca <- pr
-  mcl$order <- ord
-  mcl$allsample <- alls
-  return(mcl)
-}
-# permutation 
-
-a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC')
-
-evaluate_uncertainty <- function(inferobj, n.permute){
-  pr <- inferobj$pca
-  newbranch <- inferobj$branch
-  js.cut <- inferobj$js.cut
-  oc.cut <- inferobj$oc.cut 
-  pt <- inferobj$pseudotime
-  ord <- inferobj$order
-  alls <- inferobj$allsample
-  ctcomplist <- reproduce.js <- reproduce.oc <- corr.score <- list()
-  for (pmid in seq(1, n.permute)){
-    ## boostrap cells
-    print(pmid)
-    set.seed(pmid)
-    pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),]
-    pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),]
-    
-    ## cluster cells
-    clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ###
-  
-    # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-    # pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu))
-    # pd.text.x = tapply(pd[,1], list(pd$clu), mean)
-    # pd.text.y = tapply(pd[,2], list(pd$clu), mean)
-    # pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x))
-    # pd.text[14,1:2] =  c(pd.text[14,1] + 2, pd.text[14,2] + 1)
-    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    # ggplot() + 
-    #   geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+
-    #   scale_color_manual(values = mypalette(14))+
-    #   theme_classic() + xlab('UMAP1') + ylab('UMAP2') +
-    #   geom_text(data = pd.text, aes(x = x, y = y, label = clu))
-    
-  
-    ## cell type composition in clusters
-    # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2])
-    # tab <- table(pd[,3:4])
-    # tab <- tab/rowSums(tab)
-    # pd <- melt(tab)
-    # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu)))
-    # ggplot(data = pd) +
-    #   geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') +
-    #   theme_classic() +
-    #   ylab('Celltype Proportion') +
-    #   scale_fill_manual(values = mypalette(length(unique(pd$celltype))))
-  
-    ## build pseudotime
-    mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ###
-    # plotmclust(mcl.pm, cell_point_size = 0.1)
-    
-    ## select origin cluster
-    pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean)
-    start.cluster <- names(which.min(pt.pm.mean))
-    
-    ## construct pseudotime
-    ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T)
-    # str(ord.pm)
-    
-    pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i)))
-    names(pt.pm) <- unname(unlist(ord.pm))
-    # --- check if these codes are necessary <<<<<<<<<<<<<<<<
-    ## plot pseudotime
-    
-    pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)]))
-    # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) +
-    #   geom_scattermore() + theme_classic()
-    # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-    
-    # get candidate branches
-    newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster)
-    
-    ## compare two MST
-    js <- sapply(seq(1, length(newbranch)), function(i){
-            id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-            cells <- ord[[id]]
-            b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-            sapply(seq(1, length(newbranch.pm)), function(j){
-              print(j)
-              id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-              cells <- ord.pm[[id]]
-              b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-              js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori))
-            })
-          })
-    oc <- sapply(seq(1, length(newbranch)), function(i){
-              id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-              cells <- ord[[id]]
-              b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-              sapply(seq(1, length(newbranch.pm)), function(j){
-                  id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                  cells <- ord.pm[[id]]
-                  b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                  oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori))
-             }) 
-          })
-    corr <- sapply(seq(1, length(newbranch)), function(i){
-                id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1]
-                cells <- ord[[id]]
-                b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells)
-                
-                sapply(seq(1, length(newbranch.pm)), function(j){
-                    id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1]
-                    cells <- ord.pm[[id]]
-                    b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells)
-                    ov = intersect(b.ori, b.pm)
-                    cor(pt[ov], pt.pm[ov])
-                }) 
-            })
-    corr[is.na(corr)] <- 0
-    colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch)))
-    
-    ## get js binary to match branches 
-    js.binary <- get_binary(js, js.cut)
-    corr.score[[pmid]] <- corr * js.binary
-    js.melt <- melt(js.binary)
-    js.melt <- js.melt[js.melt[,3]!=0,]
-    colnames(js.melt) <- c('permutation.branch','original.branch','matched')
-    reproduce.js[[pmid]] <- as.character(js.melt[,2])
-    
-    ## get oc binary to match branches
-    oc.binary <- get_binary(oc, oc.cut)
-    oc.melt <- melt(oc.binary)
-    oc.melt <- oc.melt[oc.melt[,3]!=0,]
-    reproduce.oc[[pmid]] <- as.character(oc.melt[,2])
-  
-    ## samples cell compositions 
-    ctcomp <- sapply(js.melt[,2], function(tmp){
-      c <- names(clu)[clu %in% newbranch.pm[[tmp]]]
-      ctcomp <- rep(0, length(unique(alls)))
-      names(ctcomp) <- unique(alls)
-      ctcomp[names(table(alls[c]))] <- table(alls[c])
-    })
-    colnames(ctcomp) <- paste0('origin', js.melt[,2])
-    ctcomp <- ctcomp/rowSums(ctcomp)
-    
-    ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch))
-    colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch)))
-    rownames(ctcomp.new) <- unique(alls)
-    ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp
-    ctcomplist[[pmid]] <- t(ctcomp.new)
-  }
-  
-  reproduce.js <- unlist(reproduce.js)  
-  js.perc <- rep(0, length(newbranch))
-  js.perc[as.numeric(names(table(reproduce.js)))] <-  table(reproduce.js)/n.permute
-  names(js.perc) <- newbranch
-  
-  reproduce.oc <- unlist(reproduce.oc)  
-  oc.perc <- rep(0, length(newbranch))
-  oc.perc[as.numeric(names(table(reproduce.oc)))] <-  table(reproduce.oc)/n.permute
-  names(oc.perc) <- newbranch
-  
-  corr.score.m <- do.call(rbind, corr.score)
-  corr.score.v <- colSums(corr.score.m)/n.permute
-  names(corr.score.v) <- newbranch
-  
-  sort((js.perc + oc.perc)/2)
-  
-  detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE)
-  sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean)
-  sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd)
-  rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))]
-  rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))]
-  
-  result <- list(detection.rate = detection.rate, 
-                 sample.cellcomp.mean = sample.cellcomp.mean, 
-                 sample.cellcomp.sd = sample.cellcomp.sd)
-  return(result)
-}
-
-result <- evaluate_uncertainty(a, 3)
-