-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsgcca.Rmd
149 lines (123 loc) · 5.03 KB
/
sgcca.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
---
title: "Fit the SGCCA"
output:
  html_document:
    df_print: paged
---
## 1. Fit a penalization value per omic
Rscript [fit_penalty.R](https://github.com/CSB-IG/SGCCA/blob/main/fit_penalty.R) 0.01 0.01 0.01 → 0.01.0.01.0.01.tsv

The data matrices have to be in the same folder the script is run from. Update `cl` (the number of parallel workers) as needed.
```{r}
suppressPackageStartupMessages({
library(mixOmics)#6.16.3
library(data.table)#1.14.2
library(parallel)#4.1.1
})
# Sparsity penalty for each omic block. The trailing comments show the
# command-line form these values take when the code runs as a script.
penalty_cpgs <- 0.01 # as.numeric(args[1])
penalty_transcripts <- 0.01 # as.numeric(args[2])
penalty_mir <- 0.01 # as.numeric(args[3])
# take model descriptors <----------------- recycled
#
# Fit one SGCCA (centroid scheme, scaled blocks) and summarise it per block.
#
# Arguments:
#   data        list of omic blocks, handed unchanged to wrapper.sgcca().
#   pc, pt, pm  penalties, combined as c(pc, pt, pm); they are matched to the
#               blocks of `data` by position (CpGs, transcripts, miRNAs in
#               this file).
# Returns:
#   A data.frame with one row per block and the columns
#     AVE        first column of the per-block AVE table of the fit,
#     nfeatures  number of non-zero loadings of the block,
#     omic       block name,
#     penalty    penalty reported back by the fit.
describe <- function(data, pc, pt, pm) {
  # subsample observations
  # data=lapply(data,function(x) x[sample(1:n,subn),])
  # TRUE rather than T: T is an ordinary variable that can be reassigned.
  fit <- wrapper.sgcca(data,
    penalty = c(pc, pt, pm), scale = TRUE,
    scheme = "centroid"
  )
  # get results description
  description <- as.data.frame(do.call(rbind, fit$AVE$AVE_X))
  # vapply pins the result to one integer per block, whatever the input.
  description$nfeatures <- vapply(
    fit$loadings,
    function(block) sum(block != 0),
    integer(1)
  )
  description$omic <- rownames(description)
  description$penalty <- fit$penalty
  colnames(description)[1] <- "AVE"
  description
}
###################### DATA TO LIST OF MATRICES PER MOLECULAR LEVEL
# Input tables: every file in the working directory whose name contains
# "eigeN".
files <- list.files()
files <- files[grep("eigeN", files)]
# sizes <- c(Basal = 129, Her2 = 47, LumA = 417, LumB = 141, Normal = 76)
# Demo setting; swap in the full vector above for the real run.
sizes <- c(Basal = 129, Her2 = 47)
# files[k] is read using sizes[k], so both must line up position by position.
# NOTE(review): this relies on the order returned by list.files() -- confirm.
stopifnot(length(files) >= length(sizes))

# cl <- parallel::makeCluster(10)
# Demo setting; swap in the line above for the real run.
cl <- parallel::makeCluster(3)
# NOTE(review): sample() on the workers is unseeded, so every run draws
# different columns; parallel::clusterSetRNGStream(cl, seed) would make the
# search reproducible.
# `finally` shuts the workers down even when a replicate fails, so a crashed
# run no longer leaves orphaned R processes behind.
resus <- tryCatch(
  {
    clusterExport(cl, c(
      "describe", "penalty_cpgs", "penalty_transcripts",
      "penalty_mir", "wrapper.sgcca", "files", "sizes", "fread"
    ))
    # One SGCCA fit per replicate; seq_len(10) for the real run.
    fits <- parLapply(cl, seq_len(3), function(replicate) {
      # 1 more than the samples coz of the rownames: column 1 is always kept,
      # plus 10 columns drawn at random from 2..sizes[k].
      picked <- lapply(sizes, function(n_cols) c(1, sample(2:n_cols, 10)))
      blocks <- lapply(
        seq_along(sizes),
        function(k) fread(files[k], select = picked[[k]])
      )
      blocks <- lapply(
        blocks,
        function(tab) as.matrix(tab[, 2:ncol(tab)], rownames = tab$V1)
      )
      stacked <- do.call(cbind, blocks)
      # separate omics: fixed row ranges of the stacked matrix, each block
      # transposed before it goes to describe(). lapply over a named list
      # always yields a named list, unlike apply(), whose return type depends
      # on the block sizes.
      omic_rows <- list(
        CpGs = 1:393132,
        transcripts = 393133:410209,
        miRNAs = 410210:410813
      )
      omics <- lapply(
        omic_rows,
        function(rows) t(stacked[rows, , drop = FALSE])
      )
      describe(omics, penalty_cpgs, penalty_transcripts, penalty_mir)
    })
    do.call(rbind, fits)
  },
  finally = stopCluster(cl)
)
head(resus)
```
## 1.1. Paste all together
```{bash}
# Concatenates every .tsv in the current directory into penalty_search.tsv
# (left commented out here).
# NOTE(review): on a re-run the *.tsv glob also matches penalty_search.tsv
# itself -- remove the old file first or write the output elsewhere.
#cat *.tsv>penalty_search.tsv
```
## 2. Choosing the penalization values with highest AVE and lowest nfeatures
[choose_penalty.R](https://github.com/CSB-IG/SGCCA/blob/main/choose_penalty.R) → Figure S1
```{r}
suppressPackageStartupMessages({
library(ggplot2)#3.3.5
library(gridExtra)#2.3
})
# Penalty-search results (penalty_search.tsv, see step 1.1); the columns used
# below are AVE, nfeatures, omic and penalty.
temp <- read.table("penalty_search.tsv", sep = "\t", header = TRUE)
temp$omic <- factor(temp$omic, levels = c("CpGs", "transcripts", "miRNAs"))
# temp=temp[order(temp$penalty),]

# Make every column except `omic` numeric, one column at a time.
# apply() over the whole data.frame would first turn it into a character
# matrix: numeric columns go through format() (7 significant digits) and the
# `omic` labels always end up as NA with a coercion warning.
value_cols <- setdiff(colnames(temp), "omic")
temp[value_cols] <- lapply(
  temp[value_cols],
  function(col) as.numeric(as.character(col))
)

# Median of each numeric column per penalty value, for the rows of one omic.
# Penalties are kept in order of first appearance; NA penalties are skipped.
median_by_penalty <- function(rows) {
  penalties <- unique(rows$penalty)
  penalties <- penalties[!is.na(penalties)]
  medians <- lapply(penalties, function(p) {
    at_p <- rows[which(rows$penalty == p), value_cols, drop = FALSE]
    vapply(at_p, median, numeric(1), na.rm = TRUE) # better than mean?
  })
  as.data.frame(do.call(rbind, medians))
}

# plot median AVE vs median nfeatures
# which() drops rows whose omic is NA instead of turning them into all-NA rows.
omics <- lapply(
  levels(temp$omic),
  function(x) temp[which(temp$omic == x), , drop = FALSE]
)
names(omics) <- levels(temp$omic)
omics <- lapply(omics, median_by_penalty)

# individual plots, or CpGs will determine the axis
plots <- lapply(seq_along(omics), function(k) {
  ggplot(omics[[k]], aes(x = nfeatures, y = AVE, col = penalty)) +
    geom_point() +
    ggtitle(names(omics)[k]) +
    theme(text = element_text(size = 18)) +
    scale_x_continuous(trans = "log10") +
    geom_line()
})
# png("sparsity_search.png")
grid.arrange(plots[[1]], plots[[2]], plots[[3]])
# dev.off()
```
## 3. Run an SGCCA per subtype, using the adjusted penalization values
Rscript [sgcca.R](https://github.com/CSB-IG/SGCCA/blob/main/sgcca.R) Her2 → Her2.selected
```{r}
# Fit the final SGCCA for one subtype and list the features it selects.
subtype <- "Her2" # args[1]
data <- fread(paste(subtype, "eigeNormi", sep = "."))
# Column V1 becomes the row names; every other column is kept as data.
data <- as.matrix(data[, 2:ncol(data)], rownames = data$V1)
# separate omics: fixed row ranges of the stacked matrix, each block
# transposed before the fit. lapply over a named list always returns a named
# list.
omic_rows <- list(
  CpGs = 1:393132,
  transcripts = 393133:410209,
  miRNAs = 410210:410813
)
data <- lapply(omic_rows, function(rows) t(data[rows, , drop = FALSE]))
penalty <- c(CpGs = 0.02, transcripts = 0.02, miRNAs = 0.05) # output of choose_penalty.R
# ncomp=nrow(data$miRNAs)-1#the last comp has all loadings>0
ncomp <- 2 # exchange with the line above
# ncomp to explain 50% of transcripts matrix according to mfa.R
final <- wrapper.sgcca(
  X = data, penalty = penalty, scale = FALSE,
  scheme = "centroid", ncomp = ncomp
)

# get selected features: the non-zero loadings of every component, per block.
# Columns are walked explicitly because apply(y, 2, ...) collapses to a
# matrix (or vector) whenever all components keep the same number of
# features, which breaks the by-name lookup below.
selected <- lapply(final$loadings, function(block) {
  by_component <- lapply(seq_len(ncol(block)), function(k) {
    loadings <- block[, k]
    loadings[loadings != 0]
  })
  names(by_component) <- colnames(block)
  # A component with nothing selected would produce a malformed row below.
  Filter(function(v) length(v) > 0, by_component)
})
# One row per selected feature: component name, loading, feature name.
# cbind() on mixed types makes every column character, as before.
selected <- as.data.frame(do.call(rbind, lapply(selected, function(y) {
  do.call(rbind, lapply(seq_along(y), function(x) {
    cbind(names(y)[x], y[[x]], names(y[[x]]))
  }))
})))
colnames(selected) <- c("component", "final", "variable")
head(selected)
# write.table(selected,paste(subtype,"selected",sep='.'),sep='\t',
# quote=F,row.names=F)
```
## 4. SGCCA with 100 subsets of the data to check the stability
Rscript [sgccaSubsample.R](https://github.com/CSB-IG/SGCCA/blob/main/sgccaSubsample.R) Her2 1 → Her2.1.selected

The same as sgcca.R, but run on a random subset of half the data.
## 5. Count the features selected on the subsets to check the stability
Rscript [joinSubsamples.R](https://github.com/CSB-IG/SGCCA/blob/main/joinSubsamples.R) Her2 → Her2.subsampled
```{r}
# Load the table written by joinSubsamples.R for Her2 and preview it.
subsampled <- read.table("Her2.subsampled", sep = "\t", header = TRUE)
dim(subsampled)
# Top-left corner, at most 5 x 5. The bounds are clamped so a table with
# fewer than 5 columns does not error and one with fewer than 5 rows is not
# padded with NA rows.
subsampled[
  seq_len(min(5, nrow(subsampled))),
  seq_len(min(5, ncol(subsampled))),
  drop = FALSE
]
```