-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path200708_lvl4_mixall_metadata_pca.R
54 lines (46 loc) · 1.67 KB
/
200708_lvl4_mixall_metadata_pca.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
library(cmapR)
library(ranger)
library(pbapply)
# prepping data ----
# Make `lvl4_data` available: reuse it when it already exists in the session,
# otherwise load the cached RData file, and as a last resort rebuild it by
# sourcing lvl4_inputs.R.
if (!exists("lvl4_data")) {
  if (file.exists("~/Dropbox/GDB_archive/CMapCorr_files/lvl4_inputs.RData")) {
    load("~/Dropbox/GDB_archive/CMapCorr_files/lvl4_inputs.RData")
  } else {
    source("lvl4_inputs.R")
  }
}
# Only `ct14` and `lig16` are wanted from lvl5_inputs.RData. The file is loaded
# into a scratch environment instead of the workspace so that its other
# objects can neither overwrite same-named workspace objects nor cause them to
# be deleted afterwards; just the two wanted objects are copied out.
temp <- new.env()
load("~/Dropbox/GDB_archive/CMapCorr_files/lvl5_inputs.RData", envir = temp)
invisible(list2env(
  mget(intersect(c("ct14", "lig16"), ls(temp, all.names = TRUE)), envir = temp),
  envir = environment()
))
rm(temp)
# PCA of Z-scores ----
# The matrix is transposed so that each of its columns becomes one observation
# (row) of the PCA, lining the scores up with the rows of lvl4_data@cdesc that
# are bound on below.
temp_pca <- prcomp(t(lvl4_data@mat))
# add metadata to matrix
# Keep at most the first 100 principal components. prcomp() returns fewer than
# 100 when the input has fewer than 100 rows or columns, and a fixed 1:100
# index would then fail with "subscript out of bounds". drop = FALSE keeps the
# score matrix (and its PC column names) even if a single component remains.
temp_data_frame <- cbind(
  lvl4_data@cdesc[, c("cell_id", "pert_dose", "pert_time")],
  as.data.frame(
    temp_pca$x[, seq_len(min(100L, ncol(temp_pca$x))), drop = FALSE]
  )
)
# Data saturation test ----
# For every ligand in lig16, collect the row names of lvl4_data@cdesc whose
# pert_iname is that ligand. %in% (rather than ==) keeps rows with a missing
# pert_iname from showing up as NA IDs.
temp_lig_id <- sapply(
  lig16,
  function(L) rownames(lvl4_data@cdesc)[lvl4_data@cdesc$pert_iname %in% L],
  simplify = FALSE
)
# Training-set sizes run from 1 to one less than the smallest ligand group, so
# at least one sample per ligand is always left over for testing. Fail with a
# clear message when no such size exists; seq(1, 0, 1) would otherwise raise
# an obscure "wrong sign in 'by' argument" error.
if (length(temp_lig_id) == 0L || min(lengths(temp_lig_id)) < 2) {
  stop("Every ligand in lig16 needs at least 2 samples in lvl4_data@cdesc.",
       call. = FALSE)
}
# One entry per training-set size N: a per-ligand list of N randomly drawn IDs.
trainIDs <- lapply(
  seq_len(min(lengths(temp_lig_id)) - 1L),
  function(N) lapply(temp_lig_id, function(X) sample(X, N))
)
# Test IDs are, ligand by ligand, whatever was not drawn for training. This
# must be computed while trainIDs is still split per ligand. Map() always
# returns a list, so the result does not switch between list and matrix
# depending on whether the leftover groups happen to have equal sizes.
testIDs <- lapply(
  trainIDs,
  function(X) unlist(Map(setdiff, temp_lig_id, X), use.names = FALSE)
)
# Flatten each training set to a plain ID vector. lapply() (not sapply())
# guarantees a list of vectors even when there is only one training-set size;
# sapply() would collapse that case into a matrix and trainIDs[[N]] would then
# return a single ID instead of the N-th training set.
trainIDs <- lapply(trainIDs, unlist, use.names = FALSE)
# ^ training ----
# Fit one ranger model per training set, with a timer-style progress bar.
# Predictors are the rows of temp_data_frame selected by the training IDs; the
# response is the matching pert_iname from lvl4_data@cdesc, as a factor.
# The result is a list with one fitted model per entry of trainIDs.
pboptions(type = "timer")
rfmodel <- pblapply(seq_along(trainIDs), function(N) {
  train_rows <- trainIDs[[N]]
  ranger(
    x = temp_data_frame[train_rows, ],
    y = as.factor(lvl4_data@cdesc[train_rows, "pert_iname"]),
    num.threads = 8,
    verbose = FALSE
  )
})
# ^ testing ----
# Run each fitted model on the test rows paired with its training set
# (rfmodel[[N]] with testIDs[[N]]), giving a list of prediction objects.
rfresults <- pblapply(seq_along(rfmodel), function(N) {
  held_out <- temp_data_frame[testIDs[[N]], ]
  predict(rfmodel[[N]], held_out)
})
# Persist the models, their predictions and the exact train/test split used.
save(
  rfmodel, rfresults, trainIDs, testIDs,
  file = "~/Dropbox/GDB_archive/CMapCorr_files/200708_lvl4_mixall_metadata_pca_saturated.RData"
)