diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/01_f.R b/hca_bone_marrow_data_analysis/geneexpr/code/01_f.R deleted file mode 100644 index 7f6341d..0000000 --- a/hca_bone_marrow_data_analysis/geneexpr/code/01_f.R +++ /dev/null @@ -1,14 +0,0 @@ -setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/') -order = readRDS('./hca/result/ery/order.rds') -mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds') -mat = mat[,order$Cell] -source('./function/01_function.R') -order = data.frame(order, Patient = gsub('_.*','', order$Cell)) -ap = as.character(unique(order$Patient)) -g1 = ap[grepl('female', ap)] -g2 = ap[grepl(':male', ap)] -f_gene = f_statistics_from_gene(mat, order, g1, g2) -saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_gender.rds') - -a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4) -saveRDS(a,'./hca/geneexpr/result/f_statistics_from_gene_gender_permute.rds') diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/01_f2.R b/hca_bone_marrow_data_analysis/geneexpr/code/01_f2.R deleted file mode 100644 index f9a2993..0000000 --- a/hca_bone_marrow_data_analysis/geneexpr/code/01_f2.R +++ /dev/null @@ -1,15 +0,0 @@ -setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/') -order = readRDS('./hca/result/ery/order.rds') -mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds') -mat = mat[,order$Cell] -source('./function/01_function.R') -order = data.frame(order, Patient = gsub('_.*','', order$Cell)) -ap = as.character(unique(order$Patient)) -g1 = ap[grepl('female', ap)] -g2 = ap[grepl(':male', ap)] -f_gene = f_statistics_from_gene(mat, order, g1, g2) -saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_gender.rds') - -mat <- mat[rowMeans(mat>0.01)>0.1, ] -a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4) -saveRDS(a,'./hca/geneexpr/result/f_statistics_from_lowExprGene_gender_permute.rds') diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/01_f3.R b/hca_bone_marrow_data_analysis/geneexpr/code/01_f3.R deleted file mode 100644 index b8338b1..0000000 --- a/hca_bone_marrow_data_analysis/geneexpr/code/01_f3.R +++ /dev/null @@ -1,14 +0,0 @@ -setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/') -order = readRDS('./hca/result/ery/order.rds') -mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds') -mat = mat[,order$Cell] -source('./function/01_function.R') -order = data.frame(order, Patient = gsub('_.*','', order$Cell)) -ap = as.character(unique(order$Patient)) -g1 = ap[grepl('female', ap)] -g2 = ap[grepl(':male', ap)] -f_gene = f_statistics_from_gene(mat, order, g1, g2) -saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_gender.rds') -mat <- mat[rowMeans(mat>0.01)>0.1, ] -a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4) -saveRDS(a,'./hca/geneexpr/result/f_statistics_from_lowExprGene_gender_permute_new1e4.rds') diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age.R b/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age.R deleted file mode 100644 index 4f45783..0000000 --- a/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age.R +++ /dev/null @@ -1,16 +0,0 @@ -setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/') -order = readRDS('./hca/result/ery/order.rds') -mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds') -mat = mat[,order$Cell] -source('./function/01_function.R') -order = data.frame(order, Patient = gsub('_.*','', order$Cell)) -ap = as.character(unique(order$Patient)) -age = as.numeric(sapply(ap, function(i) strsplit(i,':')[[1]][2])) -g1 = ap[order(age)[1:4]] -g2 = ap[order(age)[5:8]] -f_gene = f_statistics_from_gene(mat, order, g1, g2) -saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_age.rds') - -mat <- mat[rowMeans(mat>0.01)>0.1, ] -a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4) -saveRDS(a,'./hca/geneexpr/result/f_statistics_from_lowExprGene_age_permute.rds') diff --git a/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age3.R b/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age3.R deleted file mode 100644 index 7326ae4..0000000 --- a/hca_bone_marrow_data_analysis/geneexpr/code/02_f_age3.R +++ /dev/null @@ -1,16 +0,0 @@ -setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/') -order = readRDS('./hca/result/ery/order.rds') -mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds') -mat = mat[,order$Cell] -source('./function/01_function.R') -order = data.frame(order, Patient = gsub('_.*','', order$Cell)) -ap = as.character(unique(order$Patient)) -age = as.numeric(sapply(ap, function(i) strsplit(i,':')[[1]][2])) -g1 = ap[order(age)[1:4]] -g2 = ap[order(age)[5:8]] -f_gene = f_statistics_from_gene(mat, order, g1, g2) -saveRDS(f_gene,'./hca/geneexpr/result/f_statistics_from_gene_age.rds') - -mat <- mat[rowMeans(mat>0.01)>0.1, ] -a = f_statistics_from_gene_permute(mat, order, g1, g2, num.permute=1e4) -saveRDS(a,'./hca/geneexpr/result/f_statistics_from_lowExprGene_age_permute_new1e4.rds') diff --git a/hca_bone_marrow_data_analysis/geneexpr/plot/01_plot.R b/hca_bone_marrow_data_analysis/geneexpr/plot/01_plot.R deleted file mode 100644 index 3c46aa0..0000000 --- a/hca_bone_marrow_data_analysis/geneexpr/plot/01_plot.R +++ /dev/null @@ -1,144 +0,0 @@ -rm(list=ls()) -setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/') -# setwd('/Users/wenpinhou/Dropbox/trajectory_variability/') -order = readRDS('./hca/result/ery/order.rds') -mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds') -mat = mat[,order$Cell] -mat <- mat[rowMeans(mat>0.01)>0.1, ] -source('./function/01_function.R') -order = data.frame(order, Patient = gsub('_.*','', order$Cell)) -ap = as.character(unique(order$Patient)) -g1 = ap[grepl('female', ap)] -g2 = ap[grepl(':male', ap)] -source('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/function.R') -# source('/Users/wenpinhou/Dropbox/resource/function.R') -eg <- sapply(ap, function(p){ - print(p) - tmat <- mat[,grepl(p, colnames(mat))] - rownames(tmat)[rowMeans(tmat>0.01)>0.1] -}) -eg <- unique(unlist(eg)) -mat = mat[eg,] -vg <- sapply(ap, function(p){ - print(p) - tmat <- mat[,grepl(p, colnames(mat))] - vg <- findVariableGene(tmat, num.gene = NULL ,plot.statistics=TRUE, plot.dir = paste0('./hca/geneexpr/plot.mac/',p,'/')) -}) -vg <- unique(unlist(vg)) -mat = mat[vg,] -b = readRDS('./hca/geneexpr/result/f_statistics_from_gene_gender.rds') -a = readRDS('./hca/geneexpr/result/f_statistics_from_lowExprGene_gender_permute_new1e4.rds') -a = a[rownames(mat),] -b = b[rownames(a)] -pval <- sapply(seq(1,nrow(a)), function(i){ - sum(a[i,]>b[i])/ncol(a) -}) -names(pval) = rownames(a) -fdr = p.adjust(pval,method='fdr') - -# ag <- names(sort(pval)[1:16]) -ag <- names(sort(b, decreasing=TRUE)[1:16]) -library(ggplot2) -library(gridExtra) -plist <- list() -for (g in ag){ - print(g) - pd1 = mat[g, grepl('female', colnames(mat))] - pd1 = data.frame(Expr=pd1, Cell=names(pd1), Patient = gsub('_.*','',names(pd1) ), Gender='Female') - pd2 = mat[g, grepl(':male', colnames(mat))] - pd2 = data.frame(Expr=pd2, Cell=names(pd2), Patient = gsub('_.*','',names(pd2) ), Gender='Male') - pd = rbind(pd1, pd2) - pd = cbind(pd, Pseudotime = order[match(pd$Cell, order$Cell),'Pseudotime']) - linedlist <- lapply(unique(pd$Patient), function(p){ - tmat = mat[g,grepl(p,colnames(mat)),drop=F] - trainX = order$Pseudotime[grepl(p,colnames(mat))] - pred <- get_spline_fit(tmat, trainX=seq(1,ncol(tmat)), fit.min=min(order$Pseudotime), fit.max=max(order$Pseudotime)) - tmpdf <- data.frame(Expr=pred[1,], Pseudotime=trainX, Patient=p, Gender=ifelse(grepl('female',p),'female','male')) - }) - ld = do.call(rbind, linedlist) - plist[[g]] <- ggplot() + geom_point(data=pd, aes(x=Pseudotime, y=Expr, color=Patient), alpha=.1, size=.2) + - geom_line(data=ld, aes(x=Pseudotime, y=Expr, color=Patient),alpha=1, size=.5) + - theme_classic() + ggtitle(paste0(sub(':.*','',g),',p=', round(pval[g],3),',f=',round(b[g],2))) + theme(legend.position = 'none') + scale_color_manual(values=c(rep('darkblue',4),rep('orange',4))) - } -pdf('./hca/geneexpr/plot.mac/gender_diff_gene_top_f.pdf',width=12,height=9) -# pdf('./hca/geneexpr/plot.mac/gender_diff_gene_top_pval.pdf',width=12,height=9) -grid.arrange(grobs=plist,nrow=4) -dev.off() - - -############# plot order permutation result -u1 = readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/chrX_genename.rds') -u2 = readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/chrY_genename.rds') -allg = sub(':.*','',names(sort(pval))) -str(allg) - -v1 <- sapply(seq(1,length(allg)), function(i){ - sum(allg[seq(1,i)] %in% u1) -}) - -v2 <- sapply(seq(1,length(allg)), function(i){ - sum(allg[seq(1,i)] %in% u2) -}) -v1_pm <- sapply(seq(1,1e2), function(myseed){ - set.seed(myseed) - w1 = sample(allg, length(u1)) - v1 <- sapply(seq(1,length(allg)), function(i){ - sum(allg[seq(1,i)] %in% w1) - }) -}) -rownames(v1_pm) <- paste0('top',seq(1,nrow(v1_pm))) -saveRDS(v1_pm, './hca/geneexpr/result/geneset_same_length_as_chrX_gene_pm_mean_order.rds') -v1_pm <- rowMeans(v1_pm) -v2_pm <- sapply(seq(1, 1e2), function(myseed){ - print(myseed) - set.seed(myseed) - w2 = sample(allg, length(u2)) - v2 <- sapply(seq(1,length(allg)), function(i){ - sum(allg[seq(1,i)] %in% w2) - }) -}) -rownames(v2_pm) <- paste0('top',seq(1,nrow(v2_pm))) -saveRDS(v2_pm, './hca/geneexpr/result/geneset_same_length_as_chrY_gene_pm_mean_order.rds') -v2_pm <- rowMeans(v2_pm) -df = data.frame(chrX=v1, chrY=v2, chrX_pm = v1_pm, chrY_pm = v2_pm, order = seq(1,length(v1))) -saveRDS(df, './hca/geneexpr/result/df_chrX_chrY_pm_order.rds') -mat <- NULL -for (i in 1:4) { - mat <- rbind(mat,data.frame(v=df[,i],order=df[,5],type=colnames(df)[i])) -} -library(ggplot2) -pdf('./hca/geneexpr/plot.mac/chrX_chrY_order_compare_to_permutation.pdf', width=4, height=4) -ggplot(mat,aes(x=order,y=v,col=type, fill=type), alpha=.2) + geom_line() + xlim(c(0,30)) + ylim(c(0,10))+theme_classic()+ylab('number of ChrX/Y genes') + xlab('top n genes (ordered by increasing pvalue)') -dev.off() - -## all chrX + chrY -u = unique(c(u1,u2)) -v <- sapply(seq(1,length(allg)), function(i){ - sum(allg[seq(1,i)] %in% u) -}) -v_pm <- sapply(seq(1, 1e2), function(myseed){ - print(myseed) - set.seed(myseed) - w = sample(allg, length(u)) - v <- sapply(seq(1,length(allg)), function(i){ - sum(allg[seq(1,i)] %in% w) - }) -}) -rownames(v_pm) <- paste0('top',seq(1,nrow(v_pm))) -saveRDS(v_pm, './hca/geneexpr/result/geneset_same_length_as_chrXY_gene_pm_mean_order.rds') -v_pm = rowMeans(v_pm) - -df = data.frame(chrXY = v, chrXY_pm = v_pm) -saveRDS(df, './hca/geneexpr/result/df_chrXY_pm_order.rds') -mat <- NULL -for (i in 1:2) { - mat <- rbind(mat,data.frame(v=df[,i],order=seq(1,nrow(df)),type=colnames(df)[i])) -} - -pdf('./hca/geneexpr/plot.mac/chrXY_order_compare_to_permutation.pdf', width=4, height=4) -ggplot(mat,aes(x=order,y=v,col=type, fill=type), alpha=.2) + geom_line() + xlim(c(0,30)) + ylim(c(0,10))+theme_classic()+ylab('number of ChrX/Y genes') + xlab('top n genes (ordered by increasing pvalue)') -dev.off() - - - - diff --git a/hca_bone_marrow_data_analysis/geneexpr/plot/02_plot_age.R b/hca_bone_marrow_data_analysis/geneexpr/plot/02_plot_age.R deleted file mode 100644 index bdb624f..0000000 --- a/hca_bone_marrow_data_analysis/geneexpr/plot/02_plot_age.R +++ /dev/null @@ -1,73 +0,0 @@ -rm(list=ls()) -setwd('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/') -# setwd('/Users/wenpinhou/Dropbox/trajectory_variability/') -order = readRDS('./hca/result/ery/order.rds') -mat = readRDS('./hca/data/HCA/proc/matrix/saver.rds') -mat = mat[,order$Cell] -mat <- mat[rowMeans(mat>0.01)>0.1, ] -source('./function/01_function.R') -order = data.frame(order, Patient = gsub('_.*','', order$Cell)) -ap = as.character(unique(order$Patient)) -g1 = ap[grepl('female', ap)] -g2 = ap[grepl(':male', ap)] -# source('/Users/wenpinhou/Dropbox/resource/function.R') -source('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/function.R') -eg <- sapply(ap, function(p){ - print(p) - tmat <- mat[,grepl(p, colnames(mat))] - rownames(tmat)[rowMeans(tmat>0.01)>0.1] -}) -eg <- unique(unlist(eg)) -mat = mat[eg,] -vg <- sapply(ap, function(p){ - print(p) - tmat <- mat[,grepl(p, colnames(mat))] - vg <- findVariableGene(tmat, num.gene = NULL ,plot.statistics=TRUE, plot.dir = paste0('./hca/geneexpr/plot.mac/',p,'/')) -}) -vg <- unique(unlist(vg)) -mat = mat[vg,] -b = readRDS('./hca/geneexpr/result/f_statistics_from_gene_age.rds') -a = readRDS('./hca/geneexpr/result/f_statistics_from_lowExprGene_age_permute_new1e4.rds') -a = a[rownames(mat),] -b = b[rownames(a)] -pval <- sapply(seq(1,nrow(a)), function(i){ - sum(a[i,]>b[i])/ncol(a) -}) -names(pval) = rownames(a) -fdr = p.adjust(pval,method='fdr') -pdf('./hca/geneexpr/plot.mac/age_diff_f_p_fdr.pdf', width=7,height=4) -par(mfrow=c(1,2)) -smoothScatter(pval~b, xlab='f statistics', ylab='p-value') -smoothScatter(fdr~b, xlab='f statistics', ylab='fdr') -dev.off() -ag <- names(sort(pval)[1:16]) -# ag <- names(sort(b, decreasing=TRUE)[1:16]) -library(ggplot2) -library(gridExtra) -plist <- list() -for (g in ag){ - print(g) - pd1 = mat[g, grepl('female', colnames(mat))] - pd1 = data.frame(Expr=pd1, Cell=names(pd1), Patient = gsub('_.*','',names(pd1) ), Gender='Female') - pd2 = mat[g, grepl(':male', colnames(mat))] - pd2 = data.frame(Expr=pd2, Cell=names(pd2), Patient = gsub('_.*','',names(pd2) ), Gender='Male') - pd = rbind(pd1, pd2) - pd = cbind(pd, Pseudotime = order[match(pd$Cell, order$Cell),'Pseudotime']) - linedlist <- lapply(unique(pd$Patient), function(p){ - tmat = mat[g,grepl(p,colnames(mat)),drop=F] - trainX = order$Pseudotime[grepl(p,colnames(mat))] - pred <- get_spline_fit(tmat, trainX=seq(1,ncol(tmat)), fit.min=min(order$Pseudotime), fit.max=max(order$Pseudotime)) - tmpdf <- data.frame(Expr=pred[1,], Pseudotime=trainX, Patient=p, Gender=ifelse(grepl('female',p),'female','male')) - }) - ld = do.call(rbind, linedlist) - plist[[g]] <- ggplot() + geom_point(data=pd, aes(x=Pseudotime, y=Expr, color=Patient), alpha=.1, size=.2) + - geom_line(data=ld, aes(x=Pseudotime, y=Expr, color=Patient),alpha=1, size=.5) + - theme_classic() + ggtitle(paste0(sub(':.*','',g),',p=', round(pval[g],3),',f=',round(b[g],2))) + theme(legend.position = 'none') + scale_color_manual(values=c(rep('darkblue',4),rep('orange',4))) - } -# pdf('./hca/geneexpr/plot.mac/age_diff_gene_top_f.pdf',width=12,height=9) -pdf('./hca/geneexpr/plot.mac/age_diff_gene_top_pval.pdf',width=12,height=9) -grid.arrange(grobs=plist,nrow=4) -dev.off() - - - diff --git a/hca_bone_marrow_data_analysis/test_type_position/code/01_test_type_position.R b/hca_bone_marrow_data_analysis/test_type_position/code/01_test_type_position.R deleted file mode 100644 index cc298af..0000000 --- a/hca_bone_marrow_data_analysis/test_type_position/code/01_test_type_position.R +++ /dev/null @@ -1,60 +0,0 @@ -library(parallel) -source('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/function/01_function.R') -plotdir <- '/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/test_type_position/plot/testvar/clusterType9_1/' -rdir <- '/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/test_type_position/result/testvar/clusterType9_1/' -d <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/count/clusterType9_1.rds') -rownames(d) <- sub(':.*','',rownames(d)) -m = log2(d + 1) -pt <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testtime/data/data/null/pseudotime.rds') -pseudotime = pt[,2] -names(pseudotime) = pt[,1] -ap <- sub(':.*', '', colnames(m)) -design = cbind(1, c(1,1,0,0,1,1,0,0)) -rownames(design) = paste0('BM', seq(1,8)) -colnames(design) <- c('intersect', 'condition') -ca <- data.frame(Cell = colnames(m), Sample = ap) -dir.create(plotdir, showWarnings = FALSE, recursive = TRUE) -dir.create(rdir, showWarnings = FALSE, recursive = TRUE) -## test and plot -tmp <- mclapply(c('all', 'start', 'middle', 'end'), function(pos){ - ## slope only - res <- testpt(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=1, type='Variable', test.slope.only = TRUE, test.position = pos) - saveRDS(res, paste0(rdir, 'slope_', pos, '.rds')) - gene = names(rev(sort(abs(res$meandiff)))) - pdf(paste0(plotdir, 'slope_', pos, '_meandiff_genes.pdf'), width = 8, height = 8) - print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition')) - dev.off() - - gene = names(sort(res$fdr)) - pdf(paste0(plotdir, 'slope_', pos, '_fdr_genes.pdf'), width = 8, height = 8) - print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition')) - dev.off() - pdf(paste0(plotdir, 'slope_', pos, '_fdr_foldchange.pdf'), width = 4, height = 4) - print(plot(res$foldchange ~ res$fdr[names(res$foldchange)], pch = 20, xlab = 'fdr', ylab = 'LL foldchange')) - dev.off() - pdf(paste0(plotdir, 'slope_', pos, '_fdr_meandiff.pdf'), width = 4, height = 4) - print(plot(res$meandiff ~ res$fdr[names(res$meandiff)], pch = 20, xlab = 'fdr', ylab = 'group mean difference')) - dev.off() - - - ## all (intersept + slope) - res <- testpt(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=1, type='Variable', test.slope.only = FALSE, test.position = pos) - saveRDS(res, paste0(rdir, 'all_', pos, '.rds')) - gene = names(rev(sort(abs(res$meandiff)))) - pdf(paste0(plotdir, 'all_', pos, '_meandiff_genes.pdf'), width = 8, height = 8) - print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition')) - dev.off() - - gene = names(sort(res$fdr)) - pdf(paste0(plotdir, 'all_', pos, '_fdr_genes.pdf'), width = 8, height = 8) - print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition')) - dev.off() - pdf(paste0(plotdir, 'all_', pos, '_fdr_foldchange.pdf'), width = 4, height = 4) - print(plot(res$foldchange ~ res$fdr[names(res$foldchange)], pch = 20, xlab = 'fdr', ylab = 'LL foldchange')) - dev.off() - pdf(paste0(plotdir, 'slope_', pos, '_fdr_meandiff.pdf'), width = 4, height = 4) - print(plot(res$meandiff ~ res$fdr[names(res$meandiff)], pch = 20, xlab = 'fdr', ylab = 'group mean difference')) - dev.off() - - return(0) -}, mc.cores = 8) diff --git a/hca_bone_marrow_data_analysis/test_type_position/code/02_test_type_position_meandiff.R b/hca_bone_marrow_data_analysis/test_type_position/code/02_test_type_position_meandiff.R deleted file mode 100644 index 9280e56..0000000 --- a/hca_bone_marrow_data_analysis/test_type_position/code/02_test_type_position_meandiff.R +++ /dev/null @@ -1,51 +0,0 @@ -library(parallel) -source('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/function/01_function.R') -plotdir <- '/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/test_type_position/plot/' -rdir <- '/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/test_type_position/result/' -d <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/count/clusterType9_4.rds') -rownames(d) <- sub(':.*','',rownames(d)) -m = log2(d + 1) -pt <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testtime/data/data/null/pseudotime.rds') -pseudotime = pt[,2] -names(pseudotime) = pt[,1] -ap <- sub(':.*', '', colnames(m)) -design = cbind(1, c(1,1,0,0,1,1,0,0)) -rownames(design) = paste0('BM', seq(1,8)) -colnames(design) <- c('intersect', 'condition') -ca <- data.frame(Cell = colnames(m), Sample = ap) - -## test and plot -tmp <- mclapply(c('all', 'start', 'middle', 'end'), function(pos){ - res <- testpt(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=1, type='Variable', test.slope.only = TRUE, test.position = pos) - saveRDS(res, paste0(rdir, 'slope_', pos, '.rds')) - gene = names(rev(sort(abs(res$meandiff)))) - pdf(paste0(plotdir, 'slope_', pos, '_meandiff_genes.pdf'), width = 8, height = 8) - print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition')) - dev.off() - - gene = names(sort(res$fdr)) - pdf(paste0(plotdir, 'slope_', pos, '_fdr_genes.pdf'), width = 8, height = 8) - print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition')) - dev.off() - pdf(paste0(plotdir, 'slope_', pos, '_fdr_foldchange.pdf'), width = 4, height = 4) - print(plot(res$foldchange ~ res$fdr[names(res$foldchange)], pch = 20, xlab = 'fdr', ylab = 'LL foldchange')) - dev.off() - - - res <- testpt(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=1, type='Variable', test.slope.only = FALSE, test.position = pos) - saveRDS(res, paste0(rdir, 'all_', pos, '.rds')) - gene = names(rev(sort(abs(res$meandiff)))) - pdf(paste0(plotdir, 'all_', pos, '_meandiff_genes.pdf'), width = 8, height = 8) - print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition')) - dev.off() - - gene = names(sort(res$fdr)) - pdf(paste0(plotdir, 'all_', pos, '_fdr_genes.pdf'), width = 8, height = 8) - print(plotGene(testptObj = res, gene = gene[1:16], variable = 'condition')) - dev.off() - pdf(paste0(plotdir, 'all_', pos, '_fdr_foldchange.pdf'), width = 4, height = 4) - print(plot(res$foldchange ~ res$fdr[names(res$foldchange)], pch = 20, xlab = 'fdr', ylab = 'LL foldchange')) - dev.off() - return(0) -}, mc.cores = 4) - diff --git a/hca_bone_marrow_data_analysis/testpattern/code/01_test.R b/hca_bone_marrow_data_analysis/testpattern/code/01_test.R deleted file mode 100644 index 52349ba..0000000 --- a/hca_bone_marrow_data_analysis/testpattern/code/01_test.R +++ /dev/null @@ -1,135 +0,0 @@ -# ------------ -# prepare data -# ------------ -library(parallel) -library(splines) -data <- as.character(commandArgs(trailingOnly = TRUE)[[1]][1]) ## clusterType9_1 -print(paste0('Analyzing ',data, '...')) -source('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/function/01_function.R') -rdir <- paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/result/', data, '/') -plotdir <- paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/plot/', data, '/') -dir.create(rdir, recursive = TRUE) -dir.create(plotdir, recursive = TRUE) -d <- readRDS(paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/saver/', data, '.rds')) -rownames(d) <- sub(':.*','',rownames(d)) -d <- d[!duplicated(rownames(d)), ] -m = log2(d + 1) -pt <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testtime/data/data/null/pseudotime.rds') -pseudotime = pt[,2] -names(pseudotime) = pt[,1] -ap <- sub(':.*', '', colnames(m)) -design = cbind(1, c(1,1,0,0,1,1,0,0)) -rownames(design) = paste0('BM', seq(1,8)) -colnames(design) <- c('intersect', 'condition') -ca <- data.frame(Cell = colnames(m), Sample = ap, stringsAsFactors = FALSE) - -# ----- -# test -# ----- -system.time({ - Res <- ptest(expr = m, cellanno = ca, pseudotime = pseudotime, design=design, permuiter=10, EMmaxiter=100, EMitercutoff=1, verbose=F, ncores=detectCores(), type='Variable', fit.resolution = 1000) - saveRDS(Res, paste0(rdir, 'ptest_res.rds')) -}) - -# user system elapsed -# 74525.76 11940.81 3409.03 - -names(Res) -str(Res$res) - -# check <<<<<<<<<<<<< -selgene <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/selgene/selgene.rds') -selgene <- intersect(sub(':.*', '', selgene), rownames(res)) -apply(res[selgene, ], 2, summary) -apply(res[!rownames(res) %in% selgene, ], 2, summary) - -# >>>>>>>>>>>>>>>> - -# ---------------------- -# plot significant genes -# ---------------------- -res <- Res$res -write.csv(Res$res, file = paste0(rdir, 'ptest_res.csv'), quote = FALSE) -# mean diff -res1 <- res[res$meandiff.fdr < 0.05, ] - - -if (nrow(res1) > 0){ - write.csv(res1, file = paste0(rdir, 'ptest_interceptdiff.csv'), quote = FALSE) - pdf(paste0(plotdir, 'interceptdiff_fdr_gene.pdf'), width = 10, height = 7) - gene = rownames(res1)[order(res1[,'meandiff.fdr'])] - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res1))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() - - pdf(paste0(plotdir, 'interceptdiff_diff_gene.pdf'), width = 10, height = 7) - gene = rownames(res1)[order(abs(res1[,'meandiff.diff']), decreasing = TRUE)] - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res1))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() - - pdf(paste0(plotdir, 'interceptdiff_lfc_gene.pdf'), width = 10, height = 7) - gene = rownames(res1)[order(abs(res1[,'meandiff.lfc']), decreasing = TRUE)] - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res1))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() -} else { - print('No interceptdiff fdr < 0.05!') -} - -# trend diff -res2 <- res[res$trenddiff.fdr < 0.05, ] -if (nrow(res2) > 0){ - write.csv(res2, file = paste0(rdir, 'ptest_trenddiff.csv'), quote = FALSE) - pdf(paste0(plotdir, 'trenddiff_fdr_gene.pdf'), width = 10, height = 7) - gene = rownames(res2)[order(res2[,'trenddiff.fdr'])] - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res2))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() - - pdf(paste0(plotdir, 'trenddiff_diff_gene.pdf'), width = 10, height = 7) - gene = rownames(res2)[order(abs(res2[,'trenddiff.diff']), decreasing = TRUE)] - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res2))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() - - pdf(paste0(plotdir, 'trenddiff_lfc_gene.pdf'), width = 10, height = 7) - gene = rownames(res2)[order(abs(res2[,'trenddiff.lfc']), decreasing = TRUE)] - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res2))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() -} else { - print('No trenddiff fdr < 0.05!') -} - -## intercept diff but no trend diff -res3 <- res[res$meandiff.fdr < 0.05 & res$trenddiff.fdr > 0.05, ] -if (nrow(res3) > 0){ - write.csv(res3, file = paste0(rdir, 'ptest_interceptdiff_butNoTrenddiff.csv'), quote = FALSE) - pdf(paste0(plotdir, 'interceptdiff_butNoTrenddiff_fdr_gene.pdf'), width = 10, height = 7) - gene = rownames(res3)[order(res3[,'trenddiff.fdr'])] - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res3))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() -} - -# ------------------------ -# plot insignificant genes -# ------------------------ -# intercept diff -res4 <- res[res$meandiff.fdr > 0.05, , drop = FALSE] -if (nrow(res4) > 0){ - write.csv(res4, file = paste0(rdir, 'ptest_interceptdiff_insig.csv'), quote = FALSE) - pdf(paste0(plotdir, 'interceptdiff_fdr_gene_insig.pdf'), width = 10, height = 7) - gene = rev(rownames(res4)[order(res4[,'meandiff.fdr'])]) - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res4))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() -} else { - print('No interceptdiff fdr > 0.05!') -} - -# trend diff -res5 <- res[res$trenddiff.fdr > 0.05, , drop = FALSE] -if (nrow(res5) > 0){ - write.csv(res5, file = paste0(rdir, 'ptest_trenddiff_insig.csv'), quote = FALSE) - pdf(paste0(plotdir, 'trenddiff_fdr_gene_insig.pdf'), width = 10, height = 7) - gene = rownames(res5)[order(res5[,'trenddiff.fdr'])] - print(plotGene(testptObj = Res, gene = gene[1:min(16, nrow(res5))], variable = 'condition', plot.point = T, point.alpha = 0.1, point.size = 0.1)) - dev.off() -} else { - print('No trenddiff fdr > 0.05!') -} - diff --git a/hca_bone_marrow_data_analysis/testpattern/code/02_population_pattern.R b/hca_bone_marrow_data_analysis/testpattern/code/02_population_pattern.R deleted file mode 100644 index 3690412..0000000 --- a/hca_bone_marrow_data_analysis/testpattern/code/02_population_pattern.R +++ /dev/null @@ -1,98 +0,0 @@ -# new a function -# input: testpt output including beta, phi -# input: covariables values user wants to know , if NULL then the unique values of the covatiates in the testpt data. if in the data only have age = 10, 20, 30, users can input 25 then we can output the pseudotime pattern of age == 25. -# phi * x * beta - -data <- as.character(commandArgs(trailingOnly = T)[[1]][1]) -# data = 'clusterType10_1' -source('/home-4/whou10@jhu.edu/scratch/Wenpin/resource/myfunc/01_function.R') -source('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/function/01_function.R') -plotdir <- paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/plot/', data, '/') -rdir <- paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/result/', data, '/') - -Res <- readRDS(paste0('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testpattern/result/', data, '/ptest_res.rds')) -res <- Res$res -head(res) -res <- res[res[,1] < 0.05 | res[,4] < 0.05, ] -gene <- rownames(res[res[,4] < 0.05, ]) -expr = Res$expression[gene, ] -knotnum = Res$knotnum[gene] -design = Res$design -cellanno = Res$cellanno -rownames(cellanno) = cellanno[,1] -pseudotime = Res$pseudotime - -beta <- lapply(Res$trenddiff.parameter[gene], function(i){ - i$beta -}) -names(beta) <- gene -predict.values <- Res$predict.values[gene,] - -pseudotime = pseudotime[order(pseudotime)] -expr = expr[, names(pseudotime)] -predict.values = predict.values[, names(pseudotime)] - -# plot(fit1~pseudotime, col = 'red', pch = 20, cex = .5,ylim=c(0,3)) -# points(fit2~pseudotime, pch = 20, cex = .5) -library(splines) -fit <- sapply(gene, function(g){ - print(g) - fit <- get_population_fit(Res, 'condition', g = g) - vn <- sapply(1:length(fit), function(i){ - paste0(names(fit)[i], ';', rownames(fit[[i]])) - }) - v <- as.vector(do.call(cbind, fit)) - names(v) <- vn - v -}) - -clu <- mykmeans(fit,10)$cluster -agg <- aggregate(t(fit),list(clu),mean) -agg <- agg[,-1] -agg <- as.matrix(agg) -rownames(agg) <- paste0('cluster', seq(1, nrow(agg))) - -saveRDS(fit, paste0(rdir, 'trenddiff_gene_popoulation_fit_10clu.rds')) -saveRDS(clu, paste0(rdir, 'trenddiff_gene_cluster_10clu.rds')) -saveRDS(agg, paste0(rdir, 'trenddiff_gene_agg_10clu.rds')) - -library(reshape2) -pd <- melt(agg) -pd$covariate <- as.factor(sub(';.*', '', pd[,2])) -pd$x <- as.numeric(pseudotime[sub('.*;', '', pd[,2])]) -pd$cell <- sub('.*;', '', pd$Var2) -library(ggplot2) - -pdf(paste0(plotdir, 'population_level_trenddiff_gene_10clu.pdf'), width = 4, height = max(clu)*2) -ggplot() + - geom_point(data = pd, aes(x = x, y = value, group = covariate,color = pd$covariate)) + - theme_classic() + - scale_color_brewer(palette = 'Dark2') + - xlab('Pseudotime') + ylab('Expression') + - labs(color = NULL) + - facet_wrap(~Var1, ncol = 1) -dev.off() - -library(pheatmap) -library(RColorBrewer) -hmpd <- predict.values[names(sort(clu)), names(pseudotime)] -hmpd <- hmpd[, order(as.character(pd[match(colnames(hmpd), pd$cell),'covariate']), pseudotime[colnames(hmpd)])] -anno_row <- data.frame(cluster = as.factor(clu[rownames(hmpd)])) -rownames(anno_row) = rownames(hmpd) -anno_col <- data.frame(covariate = pd[match(colnames(hmpd), pd$cell),'covariate']) -rownames(anno_col) <- colnames(hmpd) - - -png(paste0(plotdir, 'population_level_trenddiff_gene_hm_10clu.png'), width = 600, height = 600) -pheatmap(hmpd, cluster_cols = FALSE, cluster_rows = FALSE, - show_rownames = FALSE, show_colnames = FALSE, - annotation_row = anno_row, - annotaton_col = anno_col) -dev.off() - - -selgene <- readRDS('/home-4/whou10@jhu.edu/scratch/Wenpin/trajectory_variability/testvar/data/data/selgene/selgene.rds') -selgene = sub(':.*','',selgene) -df <- data.frame(interceptdiff = ifelse(res[,1]<0.05, 'interceptdiff', 'nointerceptdiff'), trenddiff = ifelse(res[,4]<0.05, 'trenddiff','notrenddiff'), selgene = ifelse(rownames(res) %in% selgene, 'true', 'false')) -rownames(df) = rownames(res) - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_14_clu_permute1e2/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_14_clu_permute1e2/code/01_reproducibility.R deleted file mode 100644 index a74ad5e..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_14_clu_permute1e2/code/01_reproducibility.R +++ /dev/null @@ -1,254 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -n.permute <- 100 -suppressMessages(library(igraph)) -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") - -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) - -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) - -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - - -### determine numPC -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:50 -optpoint <- which.min(sapply(2:20, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 7 -# pr <- pca[,1:2] - -## clustering -clu <- mykmeans(pr, number.cluster = 14)$cluster -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) - -mypalette = colorRampPalette(brewer.pal(9,'Set1')) -ggplot(data = pd, aes(x = x, y = y, color = clu)) + - geom_scattermore()+ - scale_color_manual(values = mypalette(14)) - -## cell type composition in clusters -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -ggplot(data = pd) + - geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - theme_classic() + - ylab('Celltype Proportion') + - scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - -# tmp <- which.min(sapply(1:clun,function(scn) mean(ctlevel[match(ct[match(names(clu)[clu==scn],ct[,1]),3],ctlevel[,1]),2],na.rm=T))) -# - -### -mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -# mcl <- exprmclust(t(pr), reduce = F) -plotmclust(mcl, cell_point_size = 0.1) -str(mcl) -## find origin -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - - -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- data.frame(cell = unname(unlist(ord)), time = c(1:length(ord[[1]]), 1:length(ord[[2]]), 1:length(ord[[3]]), 1:length(ord[[4]]), 1:length(ord[[5]]), 1:length(ord[[6]])), stringsAsFactors = FALSE) - -## plot pseudotime -pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[match(rownames(pca), pt[,1]),2])) -library(scattermore) -library(RColorBrewer) - - -ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue') - -# ----------- -# permutation -# ----------- -jslist <- oclist <- list() -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[match(names(mcl.pm[['clusterid']]), pt[,1]),2], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - - pt.pm <- data.frame(cell = unname(unlist(ord.pm)), time = unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))), stringsAsFactors = FALSE) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[match(rownames(pca), pt.pm[,1]),2])) - library(scattermore) - library(RColorBrewer) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() - - ## compare two MST - js <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - - oc <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc -} -saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - - - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -str(jsm) -str(ocm) -par(mfrow = c(1,2)) -hist(jsm) -hist(ocm) - -js.cut <- 0.5 -oc.cut <- 0.6 - -res <- sapply(seq(1,length(jslist)), function(i){ - js <- jslist[[i]] - js.binary <- (js > js.cut) + 0 - - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[dup.id, ] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } - } - } - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - as.character(js.melt[,2]) -}) -res <- unlist(res) -js.perc <- table(res)/n.permute -saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - oc <- oclist[[i]] - oc.binary <- (oc > oc.cut) + 0 - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - if (length(oc.melt[,2]) > 6) print(i) - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- table(res)/n.permute -saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3/code/01_reproducibility.R deleted file mode 100644 index ecb0151..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3/code/01_reproducibility.R +++ /dev/null @@ -1,236 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") - -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) - -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) - -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -# set.seed(12345) -# library(umap) -# u <- umap(pca[,1:10])$layout - -# ggplot(data.frame(u1=u[,1],u2=u[,2],ct=ct[match(rownames(u),ct[,1]),2]),aes(x=u1,y=u2,col=ct)) + geom_point() + facet_wrap(~ct) - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:20 -optpoint <- which.min(sapply(2:20, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:20] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 2 -# pr <- pca[,1:2] - -# ## clustering -# clu <- mykmeans(pr, number.cluster = 14)$cluster -# pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) -# -# mypalette = colorRampPalette(brewer.pal(9,'Set1')) -# ggplot(data = pd, aes(x = x, y = y, color = clu)) + -# geom_scattermore()+ -# scale_color_manual(values = mypalette(14)) -# -# ## cell type composition in clusters -# pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -# tab <- table(pd[,3:4]) -# tab <- tab/rowSums(tab) -# pd <- melt(tab) -# pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -# -# ggplot(data = pd) + -# geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + -# theme_classic() + -# ylab('Celltype Proportion') + -# scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - -# tmp <- which.min(sapply(1:clun,function(scn) mean(ctlevel[match(ct[match(names(clu)[clu==scn],ct[,1]),3],ctlevel[,1]),2],na.rm=T))) -# - -### mclust -# mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -mcl <- exprmclust(t(pr), reduce = F) -plotmclust(mcl, cell_point_size = 0.1) -str(mcl) -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - - -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- data.frame(cell = unname(unlist(ord)), time = unlist(sapply(sapply(ord, length), function(i) seq(1, i))), stringsAsFactors = FALSE) - -## plot pseudotime -pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[match(rownames(pca), pt[,1]),2])) -library(scattermore) -library(RColorBrewer) -ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue') - -# ----------- -# permutation -# ----------- -jslist <- oclist <- list() -for (pmid in seq(1, 1e3)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - # clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - # mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[match(names(mcl.pm[['clusterid']]), pt[,1]),2], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - - pt.pm <- data.frame(cell = unname(unlist(ord.pm)), time = unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))), stringsAsFactors = FALSE) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[match(rownames(pca), pt.pm[,1]),2])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() - - ## compare two MST - js <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - - oc <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc -} -saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - - - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -str(jsm) -str(ocm) -par(mfrow = c(1,2)) -hist(jsm) -hist(ocm) - -js.cut <- 0.5 -oc.cut <- 0.6 - -res <- sapply(seq(1,length(jslist)), function(i){ - js <- jslist[[i]] - js.binary <- (js > js.cut) + 0 - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - as.character(js.melt[,2]) -}) -res <- unlist(res) -js.perc <- table(res)/1e3 -saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - oc <- oclist[[i]] - oc.binary <- (oc > oc.cut) + 0 - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- table(res)/1e3 - -sort((js.perc + oc.perc)/2) -saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - - - - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff/code/01_reproducibility.R deleted file mode 100644 index f411805..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff/code/01_reproducibility.R +++ /dev/null @@ -1,225 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:20 -optpoint <- which.min(sapply(2:20, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:20] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 2 - -### mclust -# mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -mcl <- exprmclust(t(pr), reduce = F) -plotmclust(mcl, cell_point_size = 0.1) -str(mcl) -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- data.frame(cell = unname(unlist(ord)), time = unlist(sapply(sapply(ord, length), function(i) seq(1, i))), stringsAsFactors = FALSE) - -## plot pseudotime -pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[match(rownames(pca), pt[,1]),2])) -library(scattermore) -library(RColorBrewer) -ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------- -# null distribution of Jaccard index, overlap coefficient -# ------------------------------------------------------- -js.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) -}) - -par(mfrow = c(1,3)) -hist(js.null[[1]]) -hist(js.null[[2]]) -hist(js.null[[3]]) - -js.cut <- sapply(js.null, quantile, 0.99) - -oc.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) -}) -par(mfrow = c(1,3)) -hist(oc.null[[1]]) -hist(oc.null[[2]]) -hist(oc.null[[3]]) - -oc.cut <- sapply(oc.null, quantile, 0.99) - -# ----------- -# permutation -# ----------- -jslist <- oclist <- list() -for (pmid in seq(1, 1e3)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[match(names(mcl.pm[['clusterid']]), pt[,1]),2], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - - pt.pm <- data.frame(cell = unname(unlist(ord.pm)), time = unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))), stringsAsFactors = FALSE) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[match(rownames(pca), pt.pm[,1]),2])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() - - ## compare two MST - js <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - - oc <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc -} -saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -par(mfrow = c(1,2)) -hist(jsm) -hist(ocm) - -res <- sapply(seq(1,length(jslist)), function(i){ - js <- jslist[[i]] - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - as.character(js.melt[,2]) -}) -res <- unlist(res) -js.perc <- table(res)/1e3 -saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- table(res)/1e3 -sort((js.perc + oc.perc)/2) -saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff_js_oc_corr/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff_js_oc_corr/code/01_reproducibility.R deleted file mode 100644 index 6a0319e..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_clu_permute1e3_0.99QuantileCutoff_js_oc_corr/code/01_reproducibility.R +++ /dev/null @@ -1,274 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 1e3 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:20 -optpoint <- which.min(sapply(2:20, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:20] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 2 - -### mclust -# mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -mcl <- exprmclust(t(pr), reduce = F) -plotmclust(mcl, cell_point_size = 0.1) -str(mcl) -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - - -## plot pseudotime -pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -library(scattermore) -library(RColorBrewer) -ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------- -# null distribution of Jaccard index, overlap coefficient -# ------------------------------------------------------- -js.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) -}) - -par(mfrow = c(1,3)) -hist(js.null[[1]]) -hist(js.null[[2]]) -hist(js.null[[3]]) - -js.cut <- sapply(js.null, quantile, 0.99) - -oc.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) -}) -par(mfrow = c(1,3)) -hist(oc.null[[1]]) -hist(oc.null[[2]]) -hist(oc.null[[3]]) - -oc.cut <- sapply(oc.null, quantile, 0.99) - -# ----------- -# permutation -# ----------- -corrlist <- jslist <- oclist <- list() -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() - - ## compare two MST - js <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - ov = intersect(ord[[i]], ord.pm[[j]]) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc - corrlist[[pmid]] <- corr -} -saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -par(mfrow = c(1,2)) -hist(jsm) -hist(ocm) - -res <- corr.score <- list() -for (i in seq(1, length(jslist))){ - js <- jslist[[i]] - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[dup.id, ] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } - } - } - - - corr.score[[i]] <- corrlist[[i]] * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - res[[i]] <- as.character(js.melt[,2]) -} -res <- unlist(res) -js.perc <- table(res)/n.permute -saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -corr.score.m <- do.call(rbind, corr.score) -corr.score.v <- colSums(corr.score.m)/n.permute -saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- table(res)/n.permute -sort((js.perc + oc.perc)/2) -saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/01_reproducibility.R deleted file mode 100644 index 6a0319e..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/01_reproducibility.R +++ /dev/null @@ -1,274 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 1e3 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:20 -optpoint <- which.min(sapply(2:20, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:20] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 2 - -### mclust -# mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -mcl <- exprmclust(t(pr), reduce = F) -plotmclust(mcl, cell_point_size = 0.1) -str(mcl) -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - - -## plot pseudotime -pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -library(scattermore) -library(RColorBrewer) -ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------- -# null distribution of Jaccard index, overlap coefficient -# ------------------------------------------------------- -js.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) -}) - -par(mfrow = c(1,3)) -hist(js.null[[1]]) -hist(js.null[[2]]) -hist(js.null[[3]]) - -js.cut <- sapply(js.null, quantile, 0.99) - -oc.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) -}) -par(mfrow = c(1,3)) -hist(oc.null[[1]]) -hist(oc.null[[2]]) -hist(oc.null[[3]]) - -oc.cut <- sapply(oc.null, quantile, 0.99) - -# ----------- -# permutation -# ----------- -corrlist <- jslist <- oclist <- list() -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() - - ## compare two MST - js <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - ov = intersect(ord[[i]], ord.pm[[j]]) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc - corrlist[[pmid]] <- corr -} -saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -par(mfrow = c(1,2)) -hist(jsm) -hist(ocm) - -res <- corr.score <- list() -for (i in seq(1, length(jslist))){ - js <- jslist[[i]] - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[dup.id, ] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } - } - } - - - corr.score[[i]] <- corrlist[[i]] * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - res[[i]] <- as.character(js.melt[,2]) -} -res <- unlist(res) -js.perc <- table(res)/n.permute -saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -corr.score.m <- do.call(rbind, corr.score) -corr.score.v <- colSums(corr.score.m)/n.permute -saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- table(res)/n.permute -sort((js.perc + oc.perc)/2) -saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/02_samples_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/02_samples_reproducibility.R deleted file mode 100644 index 82056b3..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/02_samples_reproducibility.R +++ /dev/null @@ -1,452 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 1e3 -max.clunum <- 50 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:max.clunum -optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 7 - -## clustering -clu <- mykmeans(pr, number.cluster = 14)$cluster -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) - -# mypalette = colorRampPalette(brewer.pal(9,'Set1')) -# ggplot(data = pd, aes(x = x, y = y, color = clu)) + -# geom_scattermore()+ -# scale_color_manual(values = mypalette(14))+ -# theme_classic() + xlab('UMAP1') + ylab('UMAP2') - -## cell type composition in clusters -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -# ggplot(data = pd) + -# geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + -# theme_classic() + -# ylab('Celltype Proportion') + -# scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - -### mclust -mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -# mcl <- exprmclust(t(pr), reduce = F) -# plotmclust(mcl, cell_point_size = 0.1) -# str(mcl) - -# -------------------- -# construct pseudotime -# -------------------- -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - -# ## plot pseudotime -# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -# library(scattermore) -# library(RColorBrewer) -# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + -# geom_scattermore() + -# scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------------ -# get candidate branches to test reproducibility, 20200726 >> -# ------------------------------------------------------------ -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - - -# ------------------------------------------------------- -# null distribution of Jaccard index, overlap coefficient -# ------------------------------------------------------- -## add here --------------->>>>>> -## for samples -## add here ---------------<<<<<<< -js.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - b.ori.alls <- gsub(':.*', '', b.ori) - alls <- gsub(':.*', '', rownames(pr)) - tmp <- mclapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - b.pm.alls <- gsub(':.*', '', b.pm) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) -}) -js.cut <- sapply(js.null, function(i) apply(i, 2, quantile, 0.99)) -# ------------------ - -oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - b.pm.alls <- gsub(':.*', '', b.pm) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) -}) -oc.cut <- sapply(oc.null, function(i) apply(i, 2, quantile, 0.99)) - -# ----------- -# permutation -# ----------- -corrlist.alls <- jslist.alls <- oclist.alls <- list() -n.permute = 100 -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - pd.text.x = tapply(pd[,1], list(pd$clu), mean) - pd.text.y = tapply(pd[,2], list(pd$clu), mean) - pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - # build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - print('i') - print(i) - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - print(j) - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp - }, simplify = FALSE) - names(js) <- paste0('branch', seq(1, length(newbranch))) - ###### ===================================== - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp - }, simplify = FALSE) - names(oc) <- paste0('branch', seq(1, length(newbranch))) - - - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # ov = intersect(b.ori, b.pm) - # cor(pt[ov], pt.pm[ov]) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - ov = intersect(b.ori.s, b.pm.s) - cor(pt[ov], pt.pm[ov]) - }) - }, mc.cores = detectCores()-2) - tmp <- do.call(rbind, tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp[is.na(tmp)] <- 0 - tmp - }, simplify = FALSE) - # corr[is.na(corr)] <- 0 - names(corr) <- paste0('branch', seq(1, length(newbranch))) - # colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - jslist.alls[[pmid]] <- js - oclist.alls[[pmid]] <- oc - corrlist.alls[[pmid]] <- corr -} -saveRDS(jslist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_js_alls.rds') -saveRDS(oclist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_oc_alls.rds') -saveRDS(corrlist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/samples/result/pm_oc_alls.rds') - -# jsm <- do.call(rbind, jslist) -# ocm <- do.call(rbind, oclist) -# par(mfrow = c(1,2)) -# hist(jsm) -# hist(ocm) -s = unique(alls)[1] -df.alls <- lapply(unique(alls), function(s){ - jslist = sapply(jslist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - oclist = sapply(oclist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - corrlist = sapply(corrlist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - - res <- corr.score <- list() - for (i in seq(1, length(jslist))){ - print(i) - js <- jslist[[i]] - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - print(dup.i) - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.i]) - js.binary[, dup.i] <- 0 - js.binary[addid, dup.i] <- 1 - } - } - } - - corr.score[[i]] <- corrlist[[i]] * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - res[[i]] <- as.character(js.melt[,2]) - } - res <- unlist(res) - js.perc <- table(res)/n.permute - names(js.perc) <- newbranch - # saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/js_percentage.rds') - - corr.score.m <- do.call(rbind, corr.score) - corr.score.v <- colSums(corr.score.m)/n.permute - names(corr.score.v) <- newbranch - # saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/corr_score.rds') - - res <- sapply(seq(1,length(oclist)), function(i){ - print(i) - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) - }) - res <- unlist(res) - oc.perc <- table(res)/n.permute - names(oc.perc) <- newbranch - sort((js.perc + oc.perc)/2) - - df <- data.frame(js.perc = js.perc, oc.perc = oc.perc, corr.score.v = corr.score.v) - df <- df[, c(2,4,5)] - -# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/oc_percentage.rds') - -}) -names(df.alls) <- unique(alls) -df.alls[order(names(df.alls))] - - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/03_try_to_build_module.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/03_try_to_build_module.R deleted file mode 100644 index c822909..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/03_try_to_build_module.R +++ /dev/null @@ -1,443 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 3 -max.clunum <- 50 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") - -# -------------------------------------------------------------- -# input: seurat integrated object including: -# umap, pca -# celltype: a dataframe, col 1 is cell name, col 2 is cell type -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) -alls <- sub(':.*', '', names(a)) -names(alls) <- names(a) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:max.clunum -optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 7 - -## clustering -clu <- mykmeans(pr, number.cluster = 14)$cluster -# pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) -# mypalette = colorRampPalette(brewer.pal(9,'Set1')) -# ggplot(data = pd, aes(x = x, y = y, color = clu)) + -# geom_scattermore()+ -# scale_color_manual(values = mypalette(14))+ -# theme_classic() + xlab('UMAP1') + ylab('UMAP2') - -# ## cell type composition in clusters -# pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -# tab <- table(pd[,3:4]) -# tab <- tab/rowSums(tab) -# pd <- melt(tab) -# pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -# -# ggplot(data = pd) + -# geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + -# theme_classic() + -# ylab('Celltype Proportion') + -# scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - -### mclust -mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -# mcl <- exprmclust(t(pr), reduce = F) -# plotmclust(mcl, cell_point_size = 0.1) -# str(mcl) - -# -------------------- -# construct pseudotime -# -------------------- -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - -# ## plot pseudotime -# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -# library(scattermore) -# library(RColorBrewer) -# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + -# geom_scattermore() + -# scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------------ -# get candidate branches to test reproducibility, 20200726 >> -# ------------------------------------------------------------ -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - -# ----------------------------------------------------- -# Evaluate robustness of tree branches using resampling -# ----------------------------------------------------- - -# null distribution of Jaccard index, overlap coefficient - -js.null <- lapply(seq(1, length(newbranch)), function(i) { - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) -}) - -# par(mfrow = c(2,ceiling(length(js.null)/2))) -# for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50) - -js.cut <- sapply(js.null, quantile, 0.99) - -oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) -}) -# par(mfrow = c(2,ceiling(length(oc.null)/2))) -# for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50) -oc.cut <- sapply(oc.null, quantile, 0.99) - -# permutation - -get_binary <- function(js){ - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.i]) - js.binary[, dup.i] <- 0 - js.binary[addid, dup.i] <- 1 - } - } - } - return(js.binary) -} - -ctcomplist <- reproduce <- corr.score <- corrlist <- jslist <- oclist <- list() - -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - pd.text.x = tapply(pd[,1], list(pd$clu), mean) - pd.text.y = tapply(pd[,2], list(pd$clu), mean) - pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - ## build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - ## plot pseudotime - - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - print(j) - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - ov = intersect(b.ori, b.pm) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc - corrlist[[pmid]] <- corr - - ## get js binary to matched branches <<<<<<<<<<<<<<< - js.binary <- get_binary(js) - corr.score[[pmid]] <- corr * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - colnames(js.melt) <- c('permutation.branch','original.branch','matched') - reproduce[[pmid]] <- as.character(js.melt[,2]) - ## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - ## - tmp <- js.melt[1,2] - ctcomp <- sapply(js.melt[,2], function(tmp){ - c <- names(clu)[clu %in% newbranch.pm[[tmp]]] - ctcomp <- rep(0, length(unique(alls))) - names(ctcomp) <- unique(alls) - ctcomp[names(table(alls[c]))] <- table(alls[c]) - }) - colnames(ctcomp) <- paste0('origin', js.melt[,2]) - ctcomp <- ctcomp/rowSums(ctcomp) - - - ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch)) - colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch))) - rownames(ctcomp.new) <- unique(alls) - ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp - ctcomplist[[pmid]] <- t(ctcomp.new) - -} - -# saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -# saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') -# -# saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -# par(mfrow = c(1,2)) -# hist(jsm) -# hist(ocm) - -## moved within boostrap -# reproduce <- corr.score <- list() -# for (i in seq(1, length(jslist))){ -# print(i) -# js <- jslist[[i]] -# js.binary <- get_binary(js) -# corr.score[[i]] <- corrlist[[i]] * js.binary -# js.melt <- melt(js.binary) -# js.melt <- js.melt[js.melt[,3]!=0,] -# colnames(js.melt) <- c('permutation.branch','original.branch','matched') -# reproduce[[i]] <- as.character(js.melt[,2]) -# } - -reproduce <- unlist(reproduce) - - -js.perc <- rep(0, length(newbranch)) -js.perc[as.numeric(names(table(reproduce)))] <- table(reproduce)/n.permute -names(js.perc) <- newbranch -# saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -corr.score.m <- do.call(rbind, corr.score) -corr.score.v <- colSums(corr.score.m)/n.permute -names(corr.score.v) <- newbranch -# saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - print(i) - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- rep(0, length(newbranch)) -oc.perc[as.numeric(names(table(res)))] <- table(res)/n.permute -names(oc.perc) <- newbranch -sort((js.perc + oc.perc)/2) - -# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - -detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE) -sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean) -sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd) -rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))] -rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))] - -result <- list(detection.rate = detection.rate, - sample.cellcomp.mean = sample.cellcomp.mean, - sample.cellcomp.sd = sample.cellcomp.sd) - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/04_try_to_build_module_v2.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/04_try_to_build_module_v2.R deleted file mode 100644 index e6d73c3..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/04_try_to_build_module_v2.R +++ /dev/null @@ -1,408 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 3 -max.clunum <- 50 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -plotdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/plot/' -# -------------------------------------------------------------- -# input: seurat integrated object including: -# umap, pca -# celltype: a dataframe, col 1 is cell name, col 2 is cell type -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA, maxclunum = 20, seed = 12345){ - ## cluster the rows - set.seed(seed) - library(parallel) - if (is.na(number.cluster)){ - rss <- mclapply(1:maxclunum,function(clunum) { - set.seed(12345) - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - print(optclunum) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -get_binary <- function(matrix, matrix.cut){ - ## match boostrap and origin branches. - ## matrix: #boostrap.branch * #origin.branch, values are js or oc - ## matrix.cut: js or oc null distribution cutoff - matrix.binary <- sapply(seq(1,ncol(matrix)), function(c){ - (matrix[,c] > matrix.cut[c]) + 0 - }) - while (length(which(rowSums(matrix.binary) > 1)) > 0 | length(which(colSums(matrix.binary) > 1)) > 0){ - dup.id <- which(rowSums(matrix.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(matrix[dup.id, ]) - matrix.binary[dup.id, ] <- 0 - matrix.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(matrix[dup.i, ]) - matrix.binary[dup.i, ] <- 0 - matrix.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(matrix.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(matrix[, dup.id]) - matrix.binary[, dup.id] <- 0 - matrix.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(matrix[, dup.i]) - matrix.binary[, dup.i] <- 0 - matrix.binary[addid, dup.i] <- 1 - } - } - } - return(matrix.binary) -} -infer_tree_structure <- function(pca, ct, origin.celltype, number.cluster = NA, plotdir = getwd()){ - alls <- sub(':.*', '', ct$cell) - names(alls) <- ct$cell - set.seed(12345) - sdev <- apply(pca, 2, sd) - x <- 1:max.clunum - optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) - })) - pcadim = optpoint + 1 - pr <- pca[,1:pcadim] # 7 - - ## clustering - # clu <- mykmeans(pr, number.cluster = number.cluster, maxclunum = 50, seed = i)$cluster - clu <- mykmeans(pr, maxclunum = 50, number.cluster = number.cluster)$cluster - table(clu) - pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) - mypalette = colorRampPalette(brewer.pal(9,'Set1')) - pdf(paste0(plotdir, 'cluster.pdf'), width = 5, height = 4) - print(ggplot(data = pd, aes(x = x, y = y, color = clu)) + - geom_scattermore()+ - scale_color_manual(values = mypalette(14))+ - theme_classic() + xlab('PC1') + ylab('PC2')) - dev.off() - ## cell type composition in clusters - pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - tab <- table(pd[,3:4]) - tab <- tab/rowSums(tab) - pd <- melt(tab) - pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - pdf(paste0(plotdir, 'celltype_composition_for_cluster.pdf'), width = 9, height = 5) - print(ggplot(data = pd) + - geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - theme_classic() + - ylab('Celltype Proportion') + - scale_fill_manual(values = mypalette(length(unique(pd$celltype))))) - dev.off() - ### mclust - mcl <- exprmclust(t(pr),cluster=clu,reduce=F) - # mcl <- exprmclust(t(pr), reduce = F) - pdf(paste0(plotdir, 'mcl.pdf'), width=8,height=8) - print(plotmclust(mcl, cell_point_size = 0.1)) - dev.off() - - # str(mcl) - # - # -------------------- - # construct pseudotime - # -------------------- - ## find origin - pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) - pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - tab <- table(pd[,3:4]) - tab <- tab/rowSums(tab) - pd <- melt(tab) - pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - tmp <- pd[pd$celltype == origin.celltype, ] - origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - - ## construct pseudotime - ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) - str(ord) - length(ord) - pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) - names(pt) <- unname(unlist(ord)) - - # ## plot pseudotime - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) - library(scattermore) - library(RColorBrewer) - pdf(paste0(plotdir, 'pseudotime.pdf'), width = 7, height = 6) - print(ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue')) - dev.off() - # ------------------------------------------------------------ - # get candidate branches to test reproducibility, 20200726 >> - # ------------------------------------------------------------ - - newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - - # ----------------------------------------------------- - # Evaluate robustness of tree branches using resampling - # ----------------------------------------------------- - - # null distribution of Jaccard index, overlap coefficient - - js.null <- lapply(seq(1, length(newbranch)), function(i) { - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - - # par(mfrow = c(2,ceiling(length(js.null)/2))) - # for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50) - - js.cut <- sapply(js.null, quantile, 0.99) - - oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - # par(mfrow = c(2,ceiling(length(oc.null)/2))) - # for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50) - oc.cut <- sapply(oc.null, quantile, 0.99) - - mcl$pseudotime <- pt - mcl$branch <- newbranch - mcl$js.cut <- js.cut - mcl$oc.cut <- oc.cut - mcl$pca <- pr - mcl$order <- ord - mcl$allsample <- alls - return(mcl) -} -evaluate_uncertainty <- function(inferobj, n.permute){ - pr <- inferobj$pca - newbranch <- inferobj$branch - js.cut <- inferobj$js.cut - oc.cut <- inferobj$oc.cut - pt <- inferobj$pseudotime - ord <- inferobj$order - alls <- inferobj$allsample - ctcomplist <- reproduce.js <- reproduce.oc <- corr.score <- list() - for (pmid in seq(1, n.permute)){ - print(pmid) - ## boostrap cells - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - # pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - # pd.text.x = tapply(pd[,1], list(pd$clu), mean) - # pd.text.y = tapply(pd[,2], list(pd$clu), mean) - # pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - # pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - ## build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - ## plot pseudotime - - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - ov = intersect(b.ori, b.pm) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - - ## get js binary to match branches - js.binary <- get_binary(js, js.cut) - corr.score[[pmid]] <- corr * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - colnames(js.melt) <- c('permutation.branch','original.branch','matched') - reproduce.js[[pmid]] <- as.character(js.melt[,2]) - - ## get oc binary to match branches - oc.binary <- get_binary(oc, oc.cut) - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - reproduce.oc[[pmid]] <- as.character(oc.melt[,2]) - - ## samples cell compositions - ctcomp <- sapply(js.melt[,2], function(tmp){ - c <- names(clu)[clu %in% newbranch.pm[[tmp]]] - ctcomp <- rep(0, length(unique(alls))) - names(ctcomp) <- unique(alls) - ctcomp[names(table(alls[c]))] <- table(alls[c]) - }) - colnames(ctcomp) <- paste0('origin', js.melt[,2]) - ctcomp <- ctcomp/rowSums(ctcomp) - - ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch)) - colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch))) - rownames(ctcomp.new) <- unique(alls) - ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp - ctcomplist[[pmid]] <- t(ctcomp.new) - } - - reproduce.js <- unlist(reproduce.js) - js.perc <- rep(0, length(newbranch)) - js.perc[as.numeric(names(table(reproduce.js)))] <- table(reproduce.js)/n.permute - names(js.perc) <- newbranch - - reproduce.oc <- unlist(reproduce.oc) - oc.perc <- rep(0, length(newbranch)) - oc.perc[as.numeric(names(table(reproduce.oc)))] <- table(reproduce.oc)/n.permute - names(oc.perc) <- newbranch - - corr.score.m <- do.call(rbind, corr.score) - corr.score.v <- colSums(corr.score.m)/n.permute - names(corr.score.v) <- newbranch - - sort((js.perc + oc.perc)/2) - - detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE) - sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean) - sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd) - rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))] - rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))] - - result <- list(detection.rate = detection.rate, - sample.cellcomp.mean = sample.cellcomp.mean, - sample.cellcomp.sd = sample.cellcomp.sd) - return(result) -} - -# permutation -a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir) -result <- evaluate_uncertainty(a, 100) -saveRDS(result, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/result.rds') diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/05_try_to_build_module_v5.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/05_try_to_build_module_v5.R deleted file mode 100644 index f0c34b4..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module/code/05_try_to_build_module_v5.R +++ /dev/null @@ -1,38 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 3 -max.clunum <- 50 -source("/Users/wenpinhou/Dropbox/trajectory_variability/function/01_function.R") -plotdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module/plot/' -rdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module/result/' -# -------------------------------------------------------------- -# input: seurat integrated object including: -# low dim reduction: umap, pca, or phate -# celltype: a dataframe, col 1 is cell name, col 2 is cell type (at least for the cells with origin cell type), col 3 is sample name -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, sample = sapply(names(a), function(i) sub(':.*', '', i)), stringsAsFactors = FALSE) - -# permutation -a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir, xlab='Principal Component1', ylab = 'Principal Component 2') -pdf(paste0(plotdir, 'mcl.pdf'), width=6,height=5) -print(plotmclust(a, cell_point_size = 0.1, x.lab = 'PC1', y.lab = 'PC2')) -dev.off() -result <- evaluate_uncertainty(a, 100) -saveRDS(result, paste0(rdir, 'result.rds')) - -for (i in names(result)){ - write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = T) -} - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/01_reproducibility.R deleted file mode 100644 index 6a0319e..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/01_reproducibility.R +++ /dev/null @@ -1,274 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 1e3 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:20 -optpoint <- which.min(sapply(2:20, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:20] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 2 - -### mclust -# mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -mcl <- exprmclust(t(pr), reduce = F) -plotmclust(mcl, cell_point_size = 0.1) -str(mcl) -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - - -## plot pseudotime -pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -library(scattermore) -library(RColorBrewer) -ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------- -# null distribution of Jaccard index, overlap coefficient -# ------------------------------------------------------- -js.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) -}) - -par(mfrow = c(1,3)) -hist(js.null[[1]]) -hist(js.null[[2]]) -hist(js.null[[3]]) - -js.cut <- sapply(js.null, quantile, 0.99) - -oc.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) -}) -par(mfrow = c(1,3)) -hist(oc.null[[1]]) -hist(oc.null[[2]]) -hist(oc.null[[3]]) - -oc.cut <- sapply(oc.null, quantile, 0.99) - -# ----------- -# permutation -# ----------- -corrlist <- jslist <- oclist <- list() -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() - - ## compare two MST - js <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - ov = intersect(ord[[i]], ord.pm[[j]]) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc - corrlist[[pmid]] <- corr -} -saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -par(mfrow = c(1,2)) -hist(jsm) -hist(ocm) - -res <- corr.score <- list() -for (i in seq(1, length(jslist))){ - js <- jslist[[i]] - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[dup.id, ] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } - } - } - - - corr.score[[i]] <- corrlist[[i]] * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - res[[i]] <- as.character(js.melt[,2]) -} -res <- unlist(res) -js.perc <- table(res)/n.permute -saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -corr.score.m <- do.call(rbind, corr.score) -corr.score.v <- colSums(corr.score.m)/n.permute -saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- table(res)/n.permute -sort((js.perc + oc.perc)/2) -saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/02_samples_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/02_samples_reproducibility.R deleted file mode 100644 index 82056b3..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/02_samples_reproducibility.R +++ /dev/null @@ -1,452 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 1e3 -max.clunum <- 50 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:max.clunum -optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 7 - -## clustering -clu <- mykmeans(pr, number.cluster = 14)$cluster -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) - -# mypalette = colorRampPalette(brewer.pal(9,'Set1')) -# ggplot(data = pd, aes(x = x, y = y, color = clu)) + -# geom_scattermore()+ -# scale_color_manual(values = mypalette(14))+ -# theme_classic() + xlab('UMAP1') + ylab('UMAP2') - -## cell type composition in clusters -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -# ggplot(data = pd) + -# geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + -# theme_classic() + -# ylab('Celltype Proportion') + -# scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - -### mclust -mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -# mcl <- exprmclust(t(pr), reduce = F) -# plotmclust(mcl, cell_point_size = 0.1) -# str(mcl) - -# -------------------- -# construct pseudotime -# -------------------- -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - -# ## plot pseudotime -# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -# library(scattermore) -# library(RColorBrewer) -# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + -# geom_scattermore() + -# scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------------ -# get candidate branches to test reproducibility, 20200726 >> -# ------------------------------------------------------------ -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - - -# ------------------------------------------------------- -# null distribution of Jaccard index, overlap coefficient -# ------------------------------------------------------- -## add here --------------->>>>>> -## for samples -## add here ---------------<<<<<<< -js.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - b.ori.alls <- gsub(':.*', '', b.ori) - alls <- gsub(':.*', '', rownames(pr)) - tmp <- mclapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - b.pm.alls <- gsub(':.*', '', b.pm) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) -}) -js.cut <- sapply(js.null, function(i) apply(i, 2, quantile, 0.99)) -# ------------------ - -oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - b.pm.alls <- gsub(':.*', '', b.pm) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) -}) -oc.cut <- sapply(oc.null, function(i) apply(i, 2, quantile, 0.99)) - -# ----------- -# permutation -# ----------- -corrlist.alls <- jslist.alls <- oclist.alls <- list() -n.permute = 100 -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - pd.text.x = tapply(pd[,1], list(pd$clu), mean) - pd.text.y = tapply(pd[,2], list(pd$clu), mean) - pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - # build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - print('i') - print(i) - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - print(j) - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp - }, simplify = FALSE) - names(js) <- paste0('branch', seq(1, length(newbranch))) - ###### ===================================== - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp - }, simplify = FALSE) - names(oc) <- paste0('branch', seq(1, length(newbranch))) - - - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # ov = intersect(b.ori, b.pm) - # cor(pt[ov], pt.pm[ov]) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - ov = intersect(b.ori.s, b.pm.s) - cor(pt[ov], pt.pm[ov]) - }) - }, mc.cores = detectCores()-2) - tmp <- do.call(rbind, tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp[is.na(tmp)] <- 0 - tmp - }, simplify = FALSE) - # corr[is.na(corr)] <- 0 - names(corr) <- paste0('branch', seq(1, length(newbranch))) - # colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - jslist.alls[[pmid]] <- js - oclist.alls[[pmid]] <- oc - corrlist.alls[[pmid]] <- corr -} -saveRDS(jslist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_js_alls.rds') -saveRDS(oclist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_oc_alls.rds') -saveRDS(corrlist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/samples/result/pm_oc_alls.rds') - -# jsm <- do.call(rbind, jslist) -# ocm <- do.call(rbind, oclist) -# par(mfrow = c(1,2)) -# hist(jsm) -# hist(ocm) -s = unique(alls)[1] -df.alls <- lapply(unique(alls), function(s){ - jslist = sapply(jslist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - oclist = sapply(oclist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - corrlist = sapply(corrlist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - - res <- corr.score <- list() - for (i in seq(1, length(jslist))){ - print(i) - js <- jslist[[i]] - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - print(dup.i) - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.i]) - js.binary[, dup.i] <- 0 - js.binary[addid, dup.i] <- 1 - } - } - } - - corr.score[[i]] <- corrlist[[i]] * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - res[[i]] <- as.character(js.melt[,2]) - } - res <- unlist(res) - js.perc <- table(res)/n.permute - names(js.perc) <- newbranch - # saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/js_percentage.rds') - - corr.score.m <- do.call(rbind, corr.score) - corr.score.v <- colSums(corr.score.m)/n.permute - names(corr.score.v) <- newbranch - # saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/corr_score.rds') - - res <- sapply(seq(1,length(oclist)), function(i){ - print(i) - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) - }) - res <- unlist(res) - oc.perc <- table(res)/n.permute - names(oc.perc) <- newbranch - sort((js.perc + oc.perc)/2) - - df <- data.frame(js.perc = js.perc, oc.perc = oc.perc, corr.score.v = corr.score.v) - df <- df[, c(2,4,5)] - -# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/oc_percentage.rds') - -}) -names(df.alls) <- unique(alls) -df.alls[order(names(df.alls))] - - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/03_try_to_build_module.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/03_try_to_build_module.R deleted file mode 100644 index c822909..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/03_try_to_build_module.R +++ /dev/null @@ -1,443 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 3 -max.clunum <- 50 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") - -# -------------------------------------------------------------- -# input: seurat integrated object including: -# umap, pca -# celltype: a dataframe, col 1 is cell name, col 2 is cell type -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) -alls <- sub(':.*', '', names(a)) -names(alls) <- names(a) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:max.clunum -optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 7 - -## clustering -clu <- mykmeans(pr, number.cluster = 14)$cluster -# pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) -# mypalette = colorRampPalette(brewer.pal(9,'Set1')) -# ggplot(data = pd, aes(x = x, y = y, color = clu)) + -# geom_scattermore()+ -# scale_color_manual(values = mypalette(14))+ -# theme_classic() + xlab('UMAP1') + ylab('UMAP2') - -# ## cell type composition in clusters -# pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -# tab <- table(pd[,3:4]) -# tab <- tab/rowSums(tab) -# pd <- melt(tab) -# pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -# -# ggplot(data = pd) + -# geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + -# theme_classic() + -# ylab('Celltype Proportion') + -# scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - -### mclust -mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -# mcl <- exprmclust(t(pr), reduce = F) -# plotmclust(mcl, cell_point_size = 0.1) -# str(mcl) - -# -------------------- -# construct pseudotime -# -------------------- -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - -# ## plot pseudotime -# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -# library(scattermore) -# library(RColorBrewer) -# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + -# geom_scattermore() + -# scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------------ -# get candidate branches to test reproducibility, 20200726 >> -# ------------------------------------------------------------ -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - -# ----------------------------------------------------- -# Evaluate robustness of tree branches using resampling -# ----------------------------------------------------- - -# null distribution of Jaccard index, overlap coefficient - -js.null <- lapply(seq(1, length(newbranch)), function(i) { - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) -}) - -# par(mfrow = c(2,ceiling(length(js.null)/2))) -# for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50) - -js.cut <- sapply(js.null, quantile, 0.99) - -oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) -}) -# par(mfrow = c(2,ceiling(length(oc.null)/2))) -# for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50) -oc.cut <- sapply(oc.null, quantile, 0.99) - -# permutation - -get_binary <- function(js){ - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.i]) - js.binary[, dup.i] <- 0 - js.binary[addid, dup.i] <- 1 - } - } - } - return(js.binary) -} - -ctcomplist <- reproduce <- corr.score <- corrlist <- jslist <- oclist <- list() - -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - pd.text.x = tapply(pd[,1], list(pd$clu), mean) - pd.text.y = tapply(pd[,2], list(pd$clu), mean) - pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - ## build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - ## plot pseudotime - - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - print(j) - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - ov = intersect(b.ori, b.pm) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc - corrlist[[pmid]] <- corr - - ## get js binary to matched branches <<<<<<<<<<<<<<< - js.binary <- get_binary(js) - corr.score[[pmid]] <- corr * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - colnames(js.melt) <- c('permutation.branch','original.branch','matched') - reproduce[[pmid]] <- as.character(js.melt[,2]) - ## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - ## - tmp <- js.melt[1,2] - ctcomp <- sapply(js.melt[,2], function(tmp){ - c <- names(clu)[clu %in% newbranch.pm[[tmp]]] - ctcomp <- rep(0, length(unique(alls))) - names(ctcomp) <- unique(alls) - ctcomp[names(table(alls[c]))] <- table(alls[c]) - }) - colnames(ctcomp) <- paste0('origin', js.melt[,2]) - ctcomp <- ctcomp/rowSums(ctcomp) - - - ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch)) - colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch))) - rownames(ctcomp.new) <- unique(alls) - ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp - ctcomplist[[pmid]] <- t(ctcomp.new) - -} - -# saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -# saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') -# -# saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -# par(mfrow = c(1,2)) -# hist(jsm) -# hist(ocm) - -## moved within boostrap -# reproduce <- corr.score <- list() -# for (i in seq(1, length(jslist))){ -# print(i) -# js <- jslist[[i]] -# js.binary <- get_binary(js) -# corr.score[[i]] <- corrlist[[i]] * js.binary -# js.melt <- melt(js.binary) -# js.melt <- js.melt[js.melt[,3]!=0,] -# colnames(js.melt) <- c('permutation.branch','original.branch','matched') -# reproduce[[i]] <- as.character(js.melt[,2]) -# } - -reproduce <- unlist(reproduce) - - -js.perc <- rep(0, length(newbranch)) -js.perc[as.numeric(names(table(reproduce)))] <- table(reproduce)/n.permute -names(js.perc) <- newbranch -# saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -corr.score.m <- do.call(rbind, corr.score) -corr.score.v <- colSums(corr.score.m)/n.permute -names(corr.score.v) <- newbranch -# saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - print(i) - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- rep(0, length(newbranch)) -oc.perc[as.numeric(names(table(res)))] <- table(res)/n.permute -names(oc.perc) <- newbranch -sort((js.perc + oc.perc)/2) - -# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - -detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE) -sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean) -sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd) -rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))] -rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))] - -result <- list(detection.rate = detection.rate, - sample.cellcomp.mean = sample.cellcomp.mean, - sample.cellcomp.sd = sample.cellcomp.sd) - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/04_try_to_build_module_v2.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/04_try_to_build_module_v2.R deleted file mode 100644 index 43d7f22..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/04_try_to_build_module_v2.R +++ /dev/null @@ -1,409 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 3 -max.clunum <- 50 -# setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -setwd("/Users/wenpinhou/Dropbox/trajectory_variability") -plotdir <- 'tree_variability/plot/' -# -------------------------------------------------------------- -# input: seurat integrated object including: -# umap, pca -# celltype: a dataframe, col 1 is cell name, col 2 is cell type -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('hca/data/HCA/proc/integrate/umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA, maxclunum = 20, seed = 12345){ - ## cluster the rows - set.seed(seed) - library(parallel) - if (is.na(number.cluster)){ - rss <- mclapply(1:maxclunum,function(clunum) { - set.seed(12345) - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - print(optclunum) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -get_binary <- function(matrix, matrix.cut){ - ## match boostrap and origin branches. - ## matrix: #boostrap.branch * #origin.branch, values are js or oc - ## matrix.cut: js or oc null distribution cutoff - matrix.binary <- sapply(seq(1,ncol(matrix)), function(c){ - (matrix[,c] > matrix.cut[c]) + 0 - }) - while (length(which(rowSums(matrix.binary) > 1)) > 0 | length(which(colSums(matrix.binary) > 1)) > 0){ - dup.id <- which(rowSums(matrix.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(matrix[dup.id, ]) - matrix.binary[dup.id, ] <- 0 - matrix.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(matrix[dup.i, ]) - matrix.binary[dup.i, ] <- 0 - matrix.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(matrix.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(matrix[, dup.id]) - matrix.binary[, dup.id] <- 0 - matrix.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(matrix[, dup.i]) - matrix.binary[, dup.i] <- 0 - matrix.binary[addid, dup.i] <- 1 - } - } - } - return(matrix.binary) -} -infer_tree_structure <- function(pca, ct, origin.celltype, number.cluster = NA, plotdir = getwd()){ - alls <- sub(':.*', '', ct$cell) - names(alls) <- ct$cell - set.seed(12345) - sdev <- apply(pca, 2, sd) - x <- 1:max.clunum - optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) - })) - pcadim = optpoint + 1 - pr <- pca[,1:pcadim] # 7 - - ## clustering - # clu <- mykmeans(pr, number.cluster = number.cluster, maxclunum = 50, seed = i)$cluster - clu <- mykmeans(pr, maxclunum = 50, number.cluster = number.cluster)$cluster - table(clu) - pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) - mypalette = colorRampPalette(brewer.pal(9,'Set1')) - pdf(paste0(plotdir, 'cluster.pdf'), width = 5, height = 4) - print(ggplot(data = pd, aes(x = x, y = y, color = clu)) + - geom_scattermore()+ - scale_color_manual(values = mypalette(14))+ - theme_classic() + xlab('PC1') + ylab('PC2')) - dev.off() - ## cell type composition in clusters - pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - tab <- table(pd[,3:4]) - tab <- tab/rowSums(tab) - pd <- melt(tab) - pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - pdf(paste0(plotdir, 'celltype_composition_for_cluster.pdf'), width = 9, height = 5) - print(ggplot(data = pd) + - geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - theme_classic() + - ylab('Celltype Proportion') + - scale_fill_manual(values = mypalette(length(unique(pd$celltype))))) - dev.off() - ### mclust - mcl <- exprmclust(t(pr),cluster=clu,reduce=F) - # mcl <- exprmclust(t(pr), reduce = F) - pdf(paste0(plotdir, 'mcl.pdf'), width=8,height=8) - print(plotmclust(mcl, cell_point_size = 0.1)) - dev.off() - - # str(mcl) - # - # -------------------- - # construct pseudotime - # -------------------- - ## find origin - pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) - pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - tab <- table(pd[,3:4]) - tab <- tab/rowSums(tab) - pd <- melt(tab) - pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - tmp <- pd[pd$celltype == origin.celltype, ] - origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - - ## construct pseudotime - ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) - str(ord) - length(ord) - pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) - names(pt) <- unname(unlist(ord)) - - # ## plot pseudotime - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) - library(scattermore) - library(RColorBrewer) - pdf(paste0(plotdir, 'pseudotime.pdf'), width = 7, height = 6) - print(ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue')) - dev.off() - # ------------------------------------------------------------ - # get candidate branches to test reproducibility, 20200726 >> - # ------------------------------------------------------------ - - newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - - # ----------------------------------------------------- - # Evaluate robustness of tree branches using resampling - # ----------------------------------------------------- - - # null distribution of Jaccard index, overlap coefficient - - js.null <- lapply(seq(1, length(newbranch)), function(i) { - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - - # par(mfrow = c(2,ceiling(length(js.null)/2))) - # for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50) - - js.cut <- sapply(js.null, quantile, 0.99) - - oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - # par(mfrow = c(2,ceiling(length(oc.null)/2))) - # for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50) - oc.cut <- sapply(oc.null, quantile, 0.99) - - mcl$pseudotime <- pt - mcl$branch <- newbranch - mcl$js.cut <- js.cut - mcl$oc.cut <- oc.cut - mcl$pca <- pr - mcl$order <- ord - mcl$allsample <- alls - return(mcl) -} -evaluate_uncertainty <- function(inferobj, n.permute){ - pr <- inferobj$pca - newbranch <- inferobj$branch - js.cut <- inferobj$js.cut - oc.cut <- inferobj$oc.cut - pt <- inferobj$pseudotime - ord <- inferobj$order - alls <- inferobj$allsample - ctcomplist <- reproduce.js <- reproduce.oc <- corr.score <- list() - for (pmid in seq(1, n.permute)){ - print(pmid) - ## boostrap cells - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - # pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - # pd.text.x = tapply(pd[,1], list(pd$clu), mean) - # pd.text.y = tapply(pd[,2], list(pd$clu), mean) - # pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - # pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - ## build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - ## plot pseudotime - - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - ov = intersect(b.ori, b.pm) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - - ## get js binary to match branches - js.binary <- get_binary(js, js.cut) - corr.score[[pmid]] <- corr * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - colnames(js.melt) <- c('permutation.branch','original.branch','matched') - reproduce.js[[pmid]] <- as.character(js.melt[,2]) - - ## get oc binary to match branches - oc.binary <- get_binary(oc, oc.cut) - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - reproduce.oc[[pmid]] <- as.character(oc.melt[,2]) - - ## samples cell compositions - ctcomp <- sapply(js.melt[,2], function(tmp){ - c <- names(clu)[clu %in% newbranch.pm[[tmp]]] - ctcomp <- rep(0, length(unique(alls))) - names(ctcomp) <- unique(alls) - ctcomp[names(table(alls[c]))] <- table(alls[c]) - }) - colnames(ctcomp) <- paste0('origin', js.melt[,2]) - ctcomp <- ctcomp/rowSums(ctcomp) - - ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch)) - colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch))) - rownames(ctcomp.new) <- unique(alls) - ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp - ctcomplist[[pmid]] <- t(ctcomp.new) - } - - reproduce.js <- unlist(reproduce.js) - js.perc <- rep(0, length(newbranch)) - js.perc[as.numeric(names(table(reproduce.js)))] <- table(reproduce.js)/n.permute - names(js.perc) <- newbranch - - reproduce.oc <- unlist(reproduce.oc) - oc.perc <- rep(0, length(newbranch)) - oc.perc[as.numeric(names(table(reproduce.oc)))] <- table(reproduce.oc)/n.permute - names(oc.perc) <- newbranch - - corr.score.m <- do.call(rbind, corr.score) - corr.score.v <- colSums(corr.score.m)/n.permute - names(corr.score.v) <- newbranch - - sort((js.perc + oc.perc)/2) - - detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE) - sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean) - sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd) - rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))] - rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))] - - result <- list(detection.rate = detection.rate, - sample.cellcomp.mean = sample.cellcomp.mean, - sample.cellcomp.sd = sample.cellcomp.sd) - return(result) -} - -# permutation -a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir) -result <- evaluate_uncertainty(a, 100) -saveRDS(result, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/result.rds') diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/05_try_to_build_module_v3.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/05_try_to_build_module_v3.R deleted file mode 100644 index ecfa77a..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/05_try_to_build_module_v3.R +++ /dev/null @@ -1,39 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 3 -max.clunum <- 50 -source("/Users/wenpinhou/Dropbox/trajectory_variability/function/01_function.R") -plotdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module/plot/' -rdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module/result/' -# -------------------------------------------------------------- -# input: seurat integrated object including: -# low dim reduction: umap, pca, or phate -# celltype: a dataframe, col 1 is cell name, col 2 is cell type (at least for the cells with origin cell type), col 3 is sample name -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate/umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, sample = sapply(names(a), function(i) sub(':.*', '', i)), stringsAsFactors = FALSE) - -# permutation -a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir, xlab='Principal Component 1', ylab = 'Principal Component 2') -pdf(paste0(plotdir, 'mcl.pdf'), width=5.5,height=4.5) -print(plotmclust(a, cell_point_size = 0.1, x.lab = 'Pincipal Component 1', y.lab = 'Principal Component 2')) -dev.off() -result <- evaluate_uncertainty(a, 100) -saveRDS(result, paste0(rdir, 'result.rds')) - -for (i in names(result)){ - write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = T) -} - - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/06_build_module_all_rmall_rmBM1256.R b/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/06_build_module_all_rmall_rmBM1256.R deleted file mode 100644 index 05771ad..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/auto_pc_auto_nclu_module_3traj/code/06_build_module_all_rmall_rmBM1256.R +++ /dev/null @@ -1,94 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 10000 -max.clunum <- 50 -source("/Users/wenpinhou/Dropbox/trajectory_variability/function/01_function.R") -plotdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module_3traj/plot/' -rdir <- '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/auto_pc_auto_nclu_module_3traj/result/' -# -------------------------------------------------------------- -# input: seurat integrated object including: -# low dim reduction: umap, pca, or phate -# celltype: a dataframe, col 1 is cell name, col 2 is cell type (at least for the cells with origin cell type), col 3 is sample name -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate/ser/umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, sample = sapply(names(a), function(i) sub(':.*', '', i)), stringsAsFactors = FALSE) - -# permutation -res = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC', plotdir = plotdir, xlab='Principal Component 1', ylab = 'Principal Component 2', original = T) -saveRDS(res, paste0(rdir, 'infer_tree_structure_res.rds')) -png(paste0(plotdir, 'mcl.png'), width=900,height=800, res = 200) -plotmclust(res, cell_point_size = 0.1, x.lab = 'Principal Component 1', y.lab = 'Principal Component 2') -dev.off() -result <- evaluate_uncertainty(res, n.permute) -saveRDS(result, paste0(rdir, 'result.rds')) -for (i in names(result)){ - write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = T) -} - -## subsample cells, and then redo infer tree structure -# --------------- -# for all samples -# --------------- -for (rm.perc in seq(0.1, 0.8, 0.1)){ - print(rm.perc) - plotdir <- paste0('/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/rmall/', rm.perc, '/plot/') - rdir <- paste0('/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/rmall/', rm.perc, '/result/') - dir.create(plotdir, recursive = T) - dir.create(rdir, recursive = T) - - selectcell = res$order[[2]] ## get branch 5,1 cells - set.seed(12345) - rmcell = sample(selectcell, rm.perc*length(selectcell)) - subset.cell = setdiff(rownames(pca), rmcell) ## remove a percentage of cells from branch 5,1 - - pdf(paste0(plotdir, 'mcl.pdf'), width=6,height=5) - print(plotmclust(res, cell_point_size = 0.1, x.lab = 'PC1', y.lab = 'PC2', subset.cell = subset.cell)) - dev.off() - result <- evaluate_uncertainty(res, n.permute, subset.cell = subset.cell) - saveRDS(result, paste0(rdir, 'result.rds')) - for (i in names(result)){ - write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = T) - } -} - - -# ---------------------------- -# for some samples: BM1,2,5,6 -# ---------------------------- -for (rm.perc in seq(0.1, 0.8, 0.1)){ - print(rm.perc) - plotdir <- paste0('/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/rmBM1256/', rm.perc, '/plot/') - rdir <- paste0('/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/rmBM1256/', rm.perc, '/result/') - dir.create(plotdir, recursive = T) - dir.create(rdir, recursive = T) - - selectcell = res$order[[2]] - selectcell = selectcell[ct[selectcell, 'sample'] %in% c('BM1', 'BM2', 'BM5', 'BM6')] - set.seed(12345) - rmcell = sample(selectcell, rm.perc*length(selectcell)) - subset.cell = setdiff(rownames(pca), rmcell) - - pdf(paste0(plotdir, 'mcl.pdf'), width=6,height=5) - print(plotmclust(res, cell_point_size = 0.1, x.lab = 'PC1', y.lab = 'PC2', subset.cell = subset.cell)) - dev.off() - result <- evaluate_uncertainty(res, n.permute, subset.cell = subset.cell) - saveRDS(result, paste0(rdir, 'result.rds')) - for (i in names(result)){ - write.csv(result[[i]], paste0(rdir, i, '.csv'), row.names = T) - } -} - - - - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/code/01_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/code/01_reproducibility.R deleted file mode 100644 index 6a0319e..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/code/01_reproducibility.R +++ /dev/null @@ -1,274 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 1e3 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:20 -optpoint <- which.min(sapply(2:20, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:20] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 2 - -### mclust -# mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -mcl <- exprmclust(t(pr), reduce = F) -plotmclust(mcl, cell_point_size = 0.1) -str(mcl) -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - - -## plot pseudotime -pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -library(scattermore) -library(RColorBrewer) -ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - geom_scattermore() + - scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------- -# null distribution of Jaccard index, overlap coefficient -# ------------------------------------------------------- -js.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) -}) - -par(mfrow = c(1,3)) -hist(js.null[[1]]) -hist(js.null[[2]]) -hist(js.null[[3]]) - -js.cut <- sapply(js.null, quantile, 0.99) - -oc.null <- lapply(seq(1, length(ord)), function(i){ - b.ori <- ord[[i]] - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) -}) -par(mfrow = c(1,3)) -hist(oc.null[[1]]) -hist(oc.null[[2]]) -hist(oc.null[[3]]) - -oc.cut <- sapply(oc.null, quantile, 0.99) - -# ----------- -# permutation -# ----------- -corrlist <- jslist <- oclist <- list() -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - mcl.pm <- exprmclust(t(pr.pm), reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() - - ## compare two MST - js <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - b.ori <- ord[[i]] - b.pm <- ord.pm[[j]] - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(ord)), function(i){ - sapply(seq(1, length(ord.pm)), function(j){ - ov = intersect(ord[[i]], ord.pm[[j]]) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(ord))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc - corrlist[[pmid]] <- corr -} -saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -par(mfrow = c(1,2)) -hist(jsm) -hist(ocm) - -res <- corr.score <- list() -for (i in seq(1, length(jslist))){ - js <- jslist[[i]] - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[dup.id, ] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } - } - } - - - corr.score[[i]] <- corrlist[[i]] * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - res[[i]] <- as.character(js.melt[,2]) -} -res <- unlist(res) -js.perc <- table(res)/n.permute -saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -corr.score.m <- do.call(rbind, corr.score) -corr.score.v <- colSums(corr.score.m)/n.permute -saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- table(res)/n.permute -sort((js.perc + oc.perc)/2) -saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/code/02_samples_reproducibility.R b/hca_bone_marrow_data_analysis/tree_variability/code/02_samples_reproducibility.R deleted file mode 100644 index 82056b3..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/code/02_samples_reproducibility.R +++ /dev/null @@ -1,452 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 1e3 -max.clunum <- 50 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:max.clunum -optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 7 - -## clustering -clu <- mykmeans(pr, number.cluster = 14)$cluster -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) - -# mypalette = colorRampPalette(brewer.pal(9,'Set1')) -# ggplot(data = pd, aes(x = x, y = y, color = clu)) + -# geom_scattermore()+ -# scale_color_manual(values = mypalette(14))+ -# theme_classic() + xlab('UMAP1') + ylab('UMAP2') - -## cell type composition in clusters -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - -# ggplot(data = pd) + -# geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + -# theme_classic() + -# ylab('Celltype Proportion') + -# scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - -### mclust -mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -# mcl <- exprmclust(t(pr), reduce = F) -# plotmclust(mcl, cell_point_size = 0.1) -# str(mcl) - -# -------------------- -# construct pseudotime -# -------------------- -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - -# ## plot pseudotime -# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -# library(scattermore) -# library(RColorBrewer) -# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + -# geom_scattermore() + -# scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------------ -# get candidate branches to test reproducibility, 20200726 >> -# ------------------------------------------------------------ -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - - -# ------------------------------------------------------- -# null distribution of Jaccard index, overlap coefficient -# ------------------------------------------------------- -## add here --------------->>>>>> -## for samples -## add here ---------------<<<<<<< -js.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - b.ori.alls <- gsub(':.*', '', b.ori) - alls <- gsub(':.*', '', rownames(pr)) - tmp <- mclapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - b.pm.alls <- gsub(':.*', '', b.pm) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) -}) -js.cut <- sapply(js.null, function(i) apply(i, 2, quantile, 0.99)) -# ------------------ - -oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - b.pm.alls <- gsub(':.*', '', b.pm) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) -}) -oc.cut <- sapply(oc.null, function(i) apply(i, 2, quantile, 0.99)) - -# ----------- -# permutation -# ----------- -corrlist.alls <- jslist.alls <- oclist.alls <- list() -n.permute = 100 -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - # ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - pd.text.x = tapply(pd[,1], list(pd$clu), mean) - pd.text.y = tapply(pd[,2], list(pd$clu), mean) - pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - # build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - ## plot pseudotime - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - print('i') - print(i) - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - print(j) - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/length(union(b.pm.s, b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp - }, simplify = FALSE) - names(js) <- paste0('branch', seq(1, length(newbranch))) - ###### ===================================== - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - length(intersect(b.pm.s, b.ori.s))/min(length(b.pm.s), length(b.ori.s)) - }) - },mc.cores = detectCores()-2) - tmp <- do.call(rbind,tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp - }, simplify = FALSE) - names(oc) <- paste0('branch', seq(1, length(newbranch))) - - - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - b.ori.alls <- gsub(':.*', '', b.ori) - tmp <- mclapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - b.pm.alls <- gsub(':.*', '', b.pm) - # ov = intersect(b.ori, b.pm) - # cor(pt[ov], pt.pm[ov]) - tmpp <- sapply(unique(alls), function(s){ - b.pm.s <- b.pm[b.pm.alls == s] - b.ori.s <- b.ori[b.ori.alls == s] - ov = intersect(b.ori.s, b.pm.s) - cor(pt[ov], pt.pm[ov]) - }) - }, mc.cores = detectCores()-2) - tmp <- do.call(rbind, tmp) - rownames(tmp) <- paste0('branch.pm', seq(1, length(newbranch.pm))) - tmp[is.na(tmp)] <- 0 - tmp - }, simplify = FALSE) - # corr[is.na(corr)] <- 0 - names(corr) <- paste0('branch', seq(1, length(newbranch))) - # colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - jslist.alls[[pmid]] <- js - oclist.alls[[pmid]] <- oc - corrlist.alls[[pmid]] <- corr -} -saveRDS(jslist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_js_alls.rds') -saveRDS(oclist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/pm_oc_alls.rds') -saveRDS(corrlist.alls, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/samples/result/pm_oc_alls.rds') - -# jsm <- do.call(rbind, jslist) -# ocm <- do.call(rbind, oclist) -# par(mfrow = c(1,2)) -# hist(jsm) -# hist(ocm) -s = unique(alls)[1] -df.alls <- lapply(unique(alls), function(s){ - jslist = sapply(jslist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - oclist = sapply(oclist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - corrlist = sapply(corrlist.alls, function(i){ - sapply(i, function(ii) ii[,s]) - }, simplify = FALSE) - - res <- corr.score <- list() - for (i in seq(1, length(jslist))){ - print(i) - js <- jslist[[i]] - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - print(dup.i) - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.i]) - js.binary[, dup.i] <- 0 - js.binary[addid, dup.i] <- 1 - } - } - } - - corr.score[[i]] <- corrlist[[i]] * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - res[[i]] <- as.character(js.melt[,2]) - } - res <- unlist(res) - js.perc <- table(res)/n.permute - names(js.perc) <- newbranch - # saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/js_percentage.rds') - - corr.score.m <- do.call(rbind, corr.score) - corr.score.v <- colSums(corr.score.m)/n.permute - names(corr.score.v) <- newbranch - # saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/corr_score.rds') - - res <- sapply(seq(1,length(oclist)), function(i){ - print(i) - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) - }) - res <- unlist(res) - oc.perc <- table(res)/n.permute - names(oc.perc) <- newbranch - sort((js.perc + oc.perc)/2) - - df <- data.frame(js.perc = js.perc, oc.perc = oc.perc, corr.score.v = corr.score.v) - df <- df[, c(2,4,5)] - -# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/samples/oc_percentage.rds') - -}) -names(df.alls) <- unique(alls) -df.alls[order(names(df.alls))] - - - diff --git a/hca_bone_marrow_data_analysis/tree_variability/code/03_try_to_build_module.R b/hca_bone_marrow_data_analysis/tree_variability/code/03_try_to_build_module.R deleted file mode 100644 index c822909..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/code/03_try_to_build_module.R +++ /dev/null @@ -1,443 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 3 -max.clunum <- 50 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") - -# -------------------------------------------------------------- -# input: seurat integrated object including: -# umap, pca -# celltype: a dataframe, col 1 is cell name, col 2 is cell type -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) -alls <- sub(':.*', '', names(a)) -names(alls) <- names(a) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} - -### determine numPC -set.seed(12345) -sdev <- apply(pca, 2, sd) -x <- 1:max.clunum -optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) -})) -pcadim = optpoint + 1 -pr <- pca[,1:pcadim] # 7 - -## clustering -clu <- mykmeans(pr, number.cluster = 14)$cluster -# pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) -# mypalette = colorRampPalette(brewer.pal(9,'Set1')) -# ggplot(data = pd, aes(x = x, y = y, color = clu)) + -# geom_scattermore()+ -# scale_color_manual(values = mypalette(14))+ -# theme_classic() + xlab('UMAP1') + ylab('UMAP2') - -# ## cell type composition in clusters -# pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -# tab <- table(pd[,3:4]) -# tab <- tab/rowSums(tab) -# pd <- melt(tab) -# pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -# -# ggplot(data = pd) + -# geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + -# theme_classic() + -# ylab('Celltype Proportion') + -# scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - -### mclust -mcl <- exprmclust(t(pr),cluster=clu,reduce=F) -# mcl <- exprmclust(t(pr), reduce = F) -# plotmclust(mcl, cell_point_size = 0.1) -# str(mcl) - -# -------------------- -# construct pseudotime -# -------------------- -## find origin -pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) -pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) -tab <- table(pd[,3:4]) -tab <- tab/rowSums(tab) -pd <- melt(tab) -pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) -tmp <- pd[pd$celltype == 'HSC', ] -origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - -## construct pseudotime -ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) -str(ord) -length(ord) -pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) -names(pt) <- unname(unlist(ord)) - -# ## plot pseudotime -# pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) -# library(scattermore) -# library(RColorBrewer) -# ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + -# geom_scattermore() + -# scale_color_gradient(low = 'yellow', high = 'blue') - -# ------------------------------------------------------------ -# get candidate branches to test reproducibility, 20200726 >> -# ------------------------------------------------------------ -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - -# ----------------------------------------------------- -# Evaluate robustness of tree branches using resampling -# ----------------------------------------------------- - -# null distribution of Jaccard index, overlap coefficient - -js.null <- lapply(seq(1, length(newbranch)), function(i) { - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) -}) - -# par(mfrow = c(2,ceiling(length(js.null)/2))) -# for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50) - -js.cut <- sapply(js.null, quantile, 0.99) - -oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) -}) -# par(mfrow = c(2,ceiling(length(oc.null)/2))) -# for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50) -oc.cut <- sapply(oc.null, quantile, 0.99) - -# permutation - -get_binary <- function(js){ - js.binary <- sapply(seq(1,ncol(js)), function(c){ - (js[,c] > js.cut[c]) + 0 - }) - while (length(which(rowSums(js.binary) > 1)) > 0 | length(which(colSums(js.binary) > 1)) > 0){ - dup.id <- which(rowSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[dup.id, ]) - js.binary[dup.id, ] <- 0 - js.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[dup.i, ]) - js.binary[dup.i, ] <- 0 - js.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(js.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(js[, dup.id]) - js.binary[, dup.id] <- 0 - js.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(js[, dup.i]) - js.binary[, dup.i] <- 0 - js.binary[addid, dup.i] <- 1 - } - } - } - return(js.binary) -} - -ctcomplist <- reproduce <- corr.score <- corrlist <- jslist <- oclist <- list() - -for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - pd.text.x = tapply(pd[,1], list(pd$clu), mean) - pd.text.y = tapply(pd[,2], list(pd$clu), mean) - pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - ## build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - ## plot pseudotime - - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - print(j) - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(mcl$clusterid)[mcl$clusterid == k])), cells) - - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - ov = intersect(b.ori, b.pm) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - jslist[[pmid]] <- js - oclist[[pmid]] <- oc - corrlist[[pmid]] <- corr - - ## get js binary to matched branches <<<<<<<<<<<<<<< - js.binary <- get_binary(js) - corr.score[[pmid]] <- corr * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - colnames(js.melt) <- c('permutation.branch','original.branch','matched') - reproduce[[pmid]] <- as.character(js.melt[,2]) - ## >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - ## - tmp <- js.melt[1,2] - ctcomp <- sapply(js.melt[,2], function(tmp){ - c <- names(clu)[clu %in% newbranch.pm[[tmp]]] - ctcomp <- rep(0, length(unique(alls))) - names(ctcomp) <- unique(alls) - ctcomp[names(table(alls[c]))] <- table(alls[c]) - }) - colnames(ctcomp) <- paste0('origin', js.melt[,2]) - ctcomp <- ctcomp/rowSums(ctcomp) - - - ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch)) - colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch))) - rownames(ctcomp.new) <- unique(alls) - ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp - ctcomplist[[pmid]] <- t(ctcomp.new) - -} - -# saveRDS(jslist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_js.rds') -# saveRDS(oclist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') -# -# saveRDS(corrlist, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/pm_oc.rds') - -jsm <- do.call(rbind, jslist) -ocm <- do.call(rbind, oclist) -# par(mfrow = c(1,2)) -# hist(jsm) -# hist(ocm) - -## moved within boostrap -# reproduce <- corr.score <- list() -# for (i in seq(1, length(jslist))){ -# print(i) -# js <- jslist[[i]] -# js.binary <- get_binary(js) -# corr.score[[i]] <- corrlist[[i]] * js.binary -# js.melt <- melt(js.binary) -# js.melt <- js.melt[js.melt[,3]!=0,] -# colnames(js.melt) <- c('permutation.branch','original.branch','matched') -# reproduce[[i]] <- as.character(js.melt[,2]) -# } - -reproduce <- unlist(reproduce) - - -js.perc <- rep(0, length(newbranch)) -js.perc[as.numeric(names(table(reproduce)))] <- table(reproduce)/n.permute -names(js.perc) <- newbranch -# saveRDS(js.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/js_percentage.rds') - -corr.score.m <- do.call(rbind, corr.score) -corr.score.v <- colSums(corr.score.m)/n.permute -names(corr.score.v) <- newbranch -# saveRDS(corr.score.v, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/corr_score.rds') - -res <- sapply(seq(1,length(oclist)), function(i){ - print(i) - oc <- oclist[[i]] - oc.binary <- sapply(seq(1,ncol(oc)), function(c){ - (oc[,c] > oc.cut[c]) + 0 - }) - while (length(which(rowSums(oc.binary) > 1)) > 0 | length(which(colSums(oc.binary) > 1)) > 0){ - dup.id <- which(rowSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[dup.id, ]) - oc.binary[dup.id, ] <- 0 - oc.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[dup.i, ]) - oc.binary[dup.i, ] <- 0 - oc.binary[dup.i, addid] <- 1 - } - } - dup.id <- which(colSums(oc.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(oc[, dup.id]) - oc.binary[, dup.id] <- 0 - oc.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(oc[, dup.i]) - oc.binary[, dup.i] <- 0 - oc.binary[addid, dup.i] <- 1 - } - } - } - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - as.character(oc.melt[,2]) -}) -res <- unlist(res) -oc.perc <- rep(0, length(newbranch)) -oc.perc[as.numeric(names(table(res)))] <- table(res)/n.permute -names(oc.perc) <- newbranch -sort((js.perc + oc.perc)/2) - -# saveRDS(oc.perc, '/Users/wenpinhou/Dropbox/trajectory_variability/tree_variability/result/oc_percentage.rds') - -detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE) -sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean) -sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd) -rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))] -rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))] - -result <- list(detection.rate = detection.rate, - sample.cellcomp.mean = sample.cellcomp.mean, - sample.cellcomp.sd = sample.cellcomp.sd) - diff --git a/hca_bone_marrow_data_analysis/tree_variability/code/04_try_to_build_module_v2.R b/hca_bone_marrow_data_analysis/tree_variability/code/04_try_to_build_module_v2.R deleted file mode 100644 index 2c6331c..0000000 --- a/hca_bone_marrow_data_analysis/tree_variability/code/04_try_to_build_module_v2.R +++ /dev/null @@ -1,404 +0,0 @@ -rm(list=ls()) -library(ggplot2) -library(Seurat) -library(reshape2) -library(TSCAN) -library(scattermore) -library(RColorBrewer) -suppressMessages(library(igraph)) -n.permute <- 3 -max.clunum <- 50 -setwd("/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/integrate") - -# -------------------------------------------------------------- -# input: seurat integrated object including: -# umap, pca -# celltype: a dataframe, col 1 is cell name, col 2 is cell type -# origin: the origin cell type -# -------------------------------------------------------------- -# read in data -umap = readRDS('umap.rds') -pca <- as.matrix(umap@reductions$pca@cell.embeddings) -# ctlevel <- data.frame(ct=c('HSC','MPP','LMPP','CMP','CLP','GMP','MEP',"Bcell","CD4Tcell","CD8Tcell",'NKcell','Mono','Ery'),level=c(1,2,3,3,4,4,4,5,5,5,5,5,5),immunepath=c(1,1,1,0,1,0,0,1,1,1,1,0,0),monopath=c(1,1,1,1,0,1,0,0,0,0,0,1,0),erypath=c(1,1,0,1,0,0,1,0,0,0,0,0,1),stringsAsFactors = F) -str(pca) -a = readRDS('/Users/wenpinhou/Dropbox/trajectory_variability/hca/data/HCA/proc/ct/sc.rds') -ct = data.frame(cell = names(a), celltype = a, stringsAsFactors = FALSE) - -mykmeans <- function(matrix, number.cluster = NA){ - ## cluster the rows - set.seed(12345) - library(parallel) - if (is.na(number.cluster)){ - maxclunum <- 20 - rss <- mclapply(1:maxclunum,function(clunum) { - tmp <- kmeans(matrix,clunum,iter.max = 1000) - tmp$betweenss/tmp$totss - },mc.cores=20) - rss <- unlist(rss) - x <- 1:maxclunum - optclunum <- which.min(sapply(1:maxclunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(rss ~ x + x2)$residuals^2) ## check this - })) - clu <- kmeans(matrix,optclunum) - } else { - clu <- kmeans(matrix, number.cluster) - } - return(clu) -} -findbranch <- function(mst, order, origin){ - deg <- degree(mst) - vertex <- names(deg[which(deg > 2 | deg == 1)]) - if (!origin %in% vertex) vertex <- c(origin, vertex) - eg <- expand.grid(1:length(vertex), 1:length(vertex)) - eg <- eg[eg[,1]0] - - allbranch <- gsub('backbone ', '', gsub('branch: ', '', names(order))) - allbranch <- sapply(allbranch, function(i) strsplit(i, ',')[[1]]) - allbranch <- paste0(names(allbranch), collapse = ' ') - newbranch <-sapply(tmpbranch, function(i) { - tmp <- paste0(i, collapse = ',') - if (!grepl(tmp, allbranch)){ - rev(i) - } else { - i - } - }) - return(newbranch) -} -get_binary <- function(matrix, matrix.cut){ - ## match boostrap and origin branches. - ## matrix: #boostrap.branch * #origin.branch, values are js or oc - ## matrix.cut: js or oc null distribution cutoff - matrix.binary <- sapply(seq(1,ncol(matrix)), function(c){ - (matrix[,c] > matrix.cut[c]) + 0 - }) - while (length(which(rowSums(matrix.binary) > 1)) > 0 | length(which(colSums(matrix.binary) > 1)) > 0){ - dup.id <- which(rowSums(matrix.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(matrix[dup.id, ]) - matrix.binary[dup.id, ] <- 0 - matrix.binary[dup.id, addid] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(matrix[dup.i, ]) - matrix.binary[dup.i, ] <- 0 - matrix.binary[dup.i, addid] <- 1 - } - } - - dup.id <- which(colSums(matrix.binary) > 1) - if (length(dup.id) == 1){ - addid <- which.max(matrix[, dup.id]) - matrix.binary[, dup.id] <- 0 - matrix.binary[addid, dup.id] <- 1 - } else if (length(dup.id) > 1) { - for (dup.i in dup.id){ - addid <- which.max(matrix[, dup.i]) - matrix.binary[, dup.i] <- 0 - matrix.binary[addid, dup.i] <- 1 - } - } - } - return(matrix.binary) -} - -### determine numPC -infer_tree_structure <- function(pca, ct, origin.celltype){ - alls <- sub(':.*', '', ct$cell) - names(alls) <- ct$cell - set.seed(12345) - sdev <- apply(pca, 2, sd) - x <- 1:max.clunum - optpoint <- which.min(sapply(2:max.clunum, function(i) { - x2 <- pmax(0, x - i) - sum(lm(sdev[1:max.clunum] ~ x + x2)$residuals^2) - })) - pcadim = optpoint + 1 - pr <- pca[,1:pcadim] # 7 - - ## clustering - clu <- mykmeans(pr, number.cluster = 14)$cluster - # pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(clu[rownames(pr)])) - # mypalette = colorRampPalette(brewer.pal(9,'Set1')) - # ggplot(data = pd, aes(x = x, y = y, color = clu)) + - # geom_scattermore()+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') - - # ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - ### mclust - mcl <- exprmclust(t(pr),cluster=clu,reduce=F) - # mcl <- exprmclust(t(pr), reduce = F) - # plotmclust(mcl, cell_point_size = 0.1) - # str(mcl) - - # -------------------- - # construct pseudotime - # -------------------- - ## find origin - pd = data.frame(x = pr[,1], y = pr[,2], clu = as.factor(mcl$clusterid)) - pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - tab <- table(pd[,3:4]) - tab <- tab/rowSums(tab) - pd <- melt(tab) - pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - tmp <- pd[pd$celltype == origin.celltype, ] - origin.cluster <- as.numeric(tmp[which.max(tmp[,3]), 1]) - - ## construct pseudotime - ord <- TSCANorder(mcl, startcluster = origin.cluster, listbranch = T,orderonly = T) - str(ord) - length(ord) - pt <- unlist(sapply(sapply(ord, length), function(i) seq(1, i))) - names(pt) <- unname(unlist(ord)) - - # ## plot pseudotime - # pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt[rownames(pca)])) - # library(scattermore) - # library(RColorBrewer) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + - # scale_color_gradient(low = 'yellow', high = 'blue') - - # ------------------------------------------------------------ - # get candidate branches to test reproducibility, 20200726 >> - # ------------------------------------------------------------ - - newbranch <- findbranch(mst = mcl$MSTtree, order = ord, origin = origin.cluster) - - # ----------------------------------------------------- - # Evaluate robustness of tree branches using resampling - # ----------------------------------------------------- - - # null distribution of Jaccard index, overlap coefficient - - js.null <- lapply(seq(1, length(newbranch)), function(i) { - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - - # par(mfrow = c(2,ceiling(length(js.null)/2))) - # for (i in js.null) hist(i, xlab = 'js', main = '', breaks = 50) - - js.cut <- sapply(js.null, quantile, 0.99) - - oc.null <- lapply(seq(1, length(newbranch)), function(i){ - b.ori <- unlist(sapply(newbranch[[i]], function(c) names(mcl$clusterid[mcl$clusterid == c]))) - tmp <- sapply(seq(1, 1e3), function(j){ - set.seed(j) - b.pm <- sample(rownames(pr), length(b.ori)) - length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - # par(mfrow = c(2,ceiling(length(oc.null)/2))) - # for (i in oc.null) hist(i, xlab = 'oc', main = '', breaks = 50) - oc.cut <- sapply(oc.null, quantile, 0.99) - - mcl$pseudotime <- pt - mcl$branch <- newbranch - mcl$js.cut <- js.cut - mcl$oc.cut <- oc.cut - mcl$pca <- pr - mcl$order <- ord - mcl$allsample <- alls - return(mcl) -} -# permutation - -a = infer_tree_structure(pca = pca, ct = ct, origin.celltype = 'HSC') - -evaluate_uncertainty <- function(inferobj, n.permute){ - pr <- inferobj$pca - newbranch <- inferobj$branch - js.cut <- inferobj$js.cut - oc.cut <- inferobj$oc.cut - pt <- inferobj$pseudotime - ord <- inferobj$order - alls <- inferobj$allsample - ctcomplist <- reproduce.js <- reproduce.oc <- corr.score <- list() - for (pmid in seq(1, n.permute)){ - ## boostrap cells - print(pmid) - set.seed(pmid) - pr.pm <- pr[sample(rownames(pr), nrow(pr), replace = TRUE),] - pr.pm <- pr.pm[!duplicated(rownames(pr.pm)),] - - ## cluster cells - clu <- mykmeans(pr.pm, number.cluster = 14)$cluster ### - - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - # pd = data.frame(x = pr[names(clu),1], y = pr[names(clu),2], clu = as.factor(clu)) - # pd.text.x = tapply(pd[,1], list(pd$clu), mean) - # pd.text.y = tapply(pd[,2], list(pd$clu), mean) - # pd.text = data.frame(x = pd.text.x, y = pd.text.y, clu = names(pd.text.x)) - # pd.text[14,1:2] = c(pd.text[14,1] + 2, pd.text[14,2] + 1) - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - # ggplot() + - # geom_scattermore(data = pd, aes(x = x, y = y, color = clu))+ - # scale_color_manual(values = mypalette(14))+ - # theme_classic() + xlab('UMAP1') + ylab('UMAP2') + - # geom_text(data = pd.text, aes(x = x, y = y, label = clu)) - - - ## cell type composition in clusters - # pd = cbind(pd, celltype = ct[match(rownames(pd), ct[,1]),2]) - # tab <- table(pd[,3:4]) - # tab <- tab/rowSums(tab) - # pd <- melt(tab) - # pd$clu <- factor(as.character(pd$clu), levels = seq(1,max(pd$clu))) - # ggplot(data = pd) + - # geom_bar(aes(x = clu, y = value, fill = celltype), stat = 'identity', position = 'dodge') + - # theme_classic() + - # ylab('Celltype Proportion') + - # scale_fill_manual(values = mypalette(length(unique(pd$celltype)))) - - ## build pseudotime - mcl.pm <- exprmclust(t(pr.pm), cluster = clu, reduce = FALSE) ### - # plotmclust(mcl.pm, cell_point_size = 0.1) - - ## select origin cluster - pt.pm.mean<- tapply(pt[names(mcl.pm[['clusterid']])], list(mcl.pm[['clusterid']]), mean) - start.cluster <- names(which.min(pt.pm.mean)) - - ## construct pseudotime - ord.pm <- TSCANorder(mcl.pm, startcluster = start.cluster, listbranch = T,orderonly = T) - # str(ord.pm) - - pt.pm <- unlist(sapply(sapply(ord.pm, length), function(i) seq(1, i))) - names(pt.pm) <- unname(unlist(ord.pm)) - # --- check if these codes are necessary <<<<<<<<<<<<<<<< - ## plot pseudotime - - pd = data.frame(pc1 = pca[,1], pc2 = pca[,2], time = as.numeric(pt.pm[rownames(pca)])) - # ggplot(data = pd, aes(x = pc1, y = pc2, color = time)) + - # geom_scattermore() + theme_classic() - # >>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>> - - # get candidate branches - newbranch.pm <- findbranch(mst = mcl.pm$MSTtree, order = ord.pm, origin = start.cluster) - - ## compare two MST - js <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - print(j) - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - js <- length(intersect(b.pm, b.ori))/length(union(b.pm, b.ori)) - }) - }) - oc <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - oc <- length(intersect(b.pm, b.ori))/min(length(b.pm), length(b.ori)) - }) - }) - corr <- sapply(seq(1, length(newbranch)), function(i){ - id <- which(sapply(paste0(names(ord),','), function(k) grepl(paste0(paste0(newbranch[[i]], collapse = ','),','), k)))[1] - cells <- ord[[id]] - b.ori <- intersect(unlist(sapply(newbranch[[i]], function(k) names(inferobj$clusterid)[inferobj$clusterid == k])), cells) - - sapply(seq(1, length(newbranch.pm)), function(j){ - id <- which(sapply(paste0(names(ord.pm),','), function(k) grepl(paste0(paste0(newbranch.pm[[j]], collapse = ','),','), k)))[1] - cells <- ord.pm[[id]] - b.pm <- intersect(unlist(sapply(newbranch.pm[[j]], function(k) names(mcl.pm$clusterid)[mcl.pm$clusterid == k])), cells) - ov = intersect(b.ori, b.pm) - cor(pt[ov], pt.pm[ov]) - }) - }) - corr[is.na(corr)] <- 0 - colnames(corr) <- colnames(oc) <- colnames(js) <- paste0('original', seq(1, length(newbranch))) - - ## get js binary to match branches - js.binary <- get_binary(js, js.cut) - corr.score[[pmid]] <- corr * js.binary - js.melt <- melt(js.binary) - js.melt <- js.melt[js.melt[,3]!=0,] - colnames(js.melt) <- c('permutation.branch','original.branch','matched') - reproduce.js[[pmid]] <- as.character(js.melt[,2]) - - ## get oc binary to match branches - oc.binary <- get_binary(oc, oc.cut) - oc.melt <- melt(oc.binary) - oc.melt <- oc.melt[oc.melt[,3]!=0,] - reproduce.oc[[pmid]] <- as.character(oc.melt[,2]) - - ## samples cell compositions - ctcomp <- sapply(js.melt[,2], function(tmp){ - c <- names(clu)[clu %in% newbranch.pm[[tmp]]] - ctcomp <- rep(0, length(unique(alls))) - names(ctcomp) <- unique(alls) - ctcomp[names(table(alls[c]))] <- table(alls[c]) - }) - colnames(ctcomp) <- paste0('origin', js.melt[,2]) - ctcomp <- ctcomp/rowSums(ctcomp) - - ctcomp.new <- matrix(0, nrow = length(unique(alls)), ncol = length(newbranch)) - colnames(ctcomp.new) <- paste0('origin', seq(1, length(newbranch))) - rownames(ctcomp.new) <- unique(alls) - ctcomp.new[rownames(ctcomp), colnames(ctcomp)] <- ctcomp - ctcomplist[[pmid]] <- t(ctcomp.new) - } - - reproduce.js <- unlist(reproduce.js) - js.perc <- rep(0, length(newbranch)) - js.perc[as.numeric(names(table(reproduce.js)))] <- table(reproduce.js)/n.permute - names(js.perc) <- newbranch - - reproduce.oc <- unlist(reproduce.oc) - oc.perc <- rep(0, length(newbranch)) - oc.perc[as.numeric(names(table(reproduce.oc)))] <- table(reproduce.oc)/n.permute - names(oc.perc) <- newbranch - - corr.score.m <- do.call(rbind, corr.score) - corr.score.v <- colSums(corr.score.m)/n.permute - names(corr.score.v) <- newbranch - - sort((js.perc + oc.perc)/2) - - detection.rate <- data.frame(detection.rate = (js.perc + oc.perc[names(js.perc)])/2, stringsAsFactors = FALSE) - sample.cellcomp.mean <- apply(simplify2array(ctcomplist), 1:2, mean) - sample.cellcomp.sd <- apply(simplify2array(ctcomplist), 1:2, sd) - rownames(sample.cellcomp.mean) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.mean)))] - rownames(sample.cellcomp.sd) <- newbranch[as.numeric(sub('origin', '', rownames(sample.cellcomp.sd)))] - - result <- list(detection.rate = detection.rate, - sample.cellcomp.mean = sample.cellcomp.mean, - sample.cellcomp.sd = sample.cellcomp.sd) - return(result) -} - -result <- evaluate_uncertainty(a, 3) -