-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathautomated_performance_ds.R
105 lines (85 loc) · 5.19 KB
/
automated_performance_ds.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
#' Robustness testing: downsampling of the potent population OR proportional
#' downsampling of all cell types
#'
#' For every `.rds` query dataset found in `wd_string`, the cells are
#' downsampled at ratios 0.1, 0.2, ..., 1.0 in two modes ("All cell types":
#' every phenotype is downsampled proportionally; "Highly potent only": only
#' the cells with the minimum `Ground_truth` value are downsampled). Each
#' subset is re-processed (HVG selection, scaling, PCA, KNN), scored with
#' `run_stemFinder()` and evaluated with `compute_performance_ds()`.
#'
#' Side effects: the working directory is changed to `wd_string` (and left
#' there), the random seed is set to 123, and the accumulated results are
#' saved to "list_all_downsample.rda" in `wd_string` after every iteration.
#'
#' @param wd_string String describing the location of the folder containing
#'   all query datasets (`.rds`) and gene lists (`.rda`). Each validation
#'   dataset filename must contain the string "mouse", "hum", "zebra" or
#'   "celeg" corresponding to its species. The folder must contain the S and
#'   G2M gene lists `s_genes_<species>.rda` / `g2m_genes_<species>.rda` for
#'   mouse, human and celeg; datasets matching "hum" or "zebra" both use the
#'   human lists.
#' @param pcs_all Numeric vector with the number of PCs to pass to
#'   `FindNeighbors()` for each dataset, in the order returned by
#'   `list.files()`. Values are chosen based on the elbow plot inflection
#'   point of each dataset. Must be supplied by the caller.
#' @param niter Number of iterations to process validation data, run
#'   stemFinder and compute performance.
#' @return `list_all`, the results list as updated by
#'   `compute_performance_ds()` across all datasets, iterations and ratios.
#'
#' Requires: Seurat, dplyr, plus `run_stemFinder()` and
#' `compute_performance_ds()` available in the calling environment.
automated_performance_ds <- function(wd_string = "~/data", pcs_all = pcs_all, niter = 3){
  # The default `pcs_all = pcs_all` refers to itself and can never be
  # evaluated, so fail with a clear message instead of a promise error.
  if (missing(pcs_all)) {
    stop("`pcs_all` must be supplied: one number of PCs per query dataset",
         call. = FALSE)
  }
  stopifnot(is.numeric(pcs_all))

  library(Seurat)
  library(dplyr)
  setwd(wd_string)
  load("s_genes_mouse.rda")
  load("g2m_genes_mouse.rda")
  load("s_genes_human.rda")
  load("g2m_genes_human.rda")
  load("s_genes_celeg.rda")
  load("g2m_genes_celeg.rda")
  set.seed(123)

  d <- 1 # running counter handed to compute_performance_ds()
  list_all <- list(dataset = c(), iteration = c(), ds_ratio = c(), ds_type = c(),
                   spear_all_sF = c(), auc_sF = c(), spear_pheno_sF = c(),
                   pct.recov_sF = c())

  # Anchor the pattern so only files that really end in ".rds" are picked up.
  filenames <- list.files(pattern = "\\.rds$", full.names = TRUE)
  if (length(pcs_all) < length(filenames)) {
    stop("`pcs_all` has ", length(pcs_all), " value(s) but ",
         length(filenames), " .rds dataset(s) were found in '", wd_string, "'",
         call. = FALSE)
  }

  # Downsampling schedule: ratios 0.1..1 for each of the two modes.
  # It does not depend on the dataset, so build it once.
  ds_ratio <- rep(seq(0.1, 1, by = 0.1), 2)
  names(ds_ratio) <- c(rep("All cell types", 10), rep("Highly potent only", 10))

  for (f in seq_along(filenames)) {
    path <- filenames[[f]]
    id <- sub("\\.rds$", "", basename(path)) # dataset id = file name without extension
    adata <- readRDS(path)

    # Cell-cycle gene lists for the species named in the file name,
    # restricted to genes present in this dataset.
    keep_present <- function(genes) genes[genes %in% rownames(adata)]
    if (grepl("mouse", id, ignore.case = TRUE)) {
      s_genes <- keep_present(s_genes_mouse)
      g2m_genes <- keep_present(g2m_genes_mouse)
    } else if (grepl("hum", id, ignore.case = TRUE) ||
               grepl("zebra", id, ignore.case = TRUE)) {
      s_genes <- keep_present(s_genes_human)
      g2m_genes <- keep_present(g2m_genes_human)
    } else if (grepl("celeg", id, ignore.case = TRUE)) {
      s_genes <- keep_present(s_genes_celeg)
      g2m_genes <- keep_present(g2m_genes_celeg)
    } else {
      # Without this, the gene lists of the previous dataset would be reused.
      stop("Cannot determine species of dataset '", id,
           "': file name must contain \"mouse\", \"hum\", \"zebra\" or \"celeg\"",
           call. = FALSE)
    }

    if (!("Ground_truth" %in% colnames(adata@meta.data))) { # rename metadata column
      adata$Ground_truth <- adata$Order
    }

    # Default inputs of CC genes, PCs
    markers <- c(s_genes, g2m_genes)
    pcs <- pcs_all[[f]] # PCs from previously examined elbow plot

    all_cells <- rownames(adata@meta.data) # cell barcodes

    for (iter in seq_len(niter)) {
      for (r in seq_along(ds_ratio)) {
        ratio <- ds_ratio[[r]]
        type <- names(ds_ratio)[r]

        if (type == "All cell types") {
          # Proportional downsampling of all cell types
          num_all <- round(table(adata$Phenotype) * ratio)
          cells_sub <- c()
          for (l in seq_along(num_all)) {
            # barcodes of cells in a given phenotype
            in_pheno <- all_cells[which(adata$Phenotype == names(num_all)[l])]
            cells_sub <- c(cells_sub,
                           sample(in_pheno, size = num_all[[l]], replace = FALSE))
          }
        } else {
          # Downsampling of highly potent population only, as defined by
          # ground truth potency (minimum Ground_truth value)
          min_truth <- min(adata$Ground_truth, na.rm = TRUE)
          cells_potent <- all_cells[which(adata$Ground_truth == min_truth)]
          num_potent <- length(cells_potent)
          cells_sub <- c(all_cells[!all_cells %in% cells_potent],
                         sample(cells_potent, round(num_potent * ratio),
                                replace = FALSE))
        }
        adata_sub <- subset(adata, cells = cells_sub)

        # HVG, scale, PCA, KNN
        adata_sub <- FindVariableFeatures(adata_sub, selection.method = "vst",
                                          nfeatures = 2500)
        VariableFeatures(adata_sub) <-
          VariableFeatures(adata_sub)[!VariableFeatures(adata_sub) %in% markers]
        adata_sub <- ScaleData(adata_sub, features = rownames(adata_sub))

        # The number of PCs must stay below the number of cells in the subset.
        n_cells <- ncol(adata_sub)
        npcs <- min(50, n_cells - 1)
        adata_sub <- RunPCA(adata_sub, npcs = npcs)

        k <- round(sqrt(n_cells)) # default value
        # Never ask for more dimensions than were computed. A local variable
        # keeps a clamp on a small subset from leaking into later subsets.
        dims_use <- min(pcs, npcs)
        adata_sub <- FindNeighbors(adata_sub, dims = 1:dims_use, k.param = k)
        # NOTE(review): graph name assumes the default assay is "RNA" - confirm
        knn <- adata_sub@graphs$RNA_nn

        # Run stemFinder and compute performance
        adata_sub <- run_stemFinder(adata_sub, k = k, nn = knn, thresh = 0,
                                    markers = markers)
        print("stemFinder finished running successfully!")
        list_all <- compute_performance_ds(adata_sub, d, id, list_all,
                                           iter = iter, ds_ratio = ratio,
                                           ds_type = type)
        d <- d + 1
      }
      # Checkpoint after every iteration so partial results survive a failure.
      save(list_all, file = "list_all_downsample.rda")
    }
  }
  return(list_all)
}