sgcca.Rmd

---
title: "Fit the SGCCA"
output:
  html_document:
    df_print: paged
---

## 1. Fit a penalization value per omic

Rscript [fit_penalty.R](https://github.com/CSB-IG/SGCCA/blob/main/fit_penalty.R) 0.01 0.01 0.01→ 0.01.0.01.0.01.tsv

data matrices have to be in the same folder

update cl as needed

```{r}
suppressPackageStartupMessages({
library(mixOmics)#6.16.3
library(data.table)#1.14.2
library(parallel)#4.1.1
})

penalty_cpgs=0.01#as.numeric(args[1])
penalty_transcripts=0.01#as.numeric(args[2])
penalty_mir=0.01#as.numeric(args[3])

#take model descriptors<-----------------recycled
describe=function(data,pc,pt,pm){
	#subsample observations
	#data=lapply(data,function(x) x[sample(1:n,subn),])
	resus=wrapper.sgcca(data,penalty=c(pc,pt,pm),scale=T,
		scheme="centroid")
	#get results description
	description=as.data.frame(do.call(rbind,resus$AVE$AVE_X))
	description$nfeatures=sapply(resus$loadings,function(x) sum(x!=0))
	description$omic=rownames(description)
	description$penalty=resus$penalty
	colnames(description)[1]="AVE"
return(description)}

######################DATA TO LIST OF MATRIXES PER MOLECULAR LEVEL

files=list.files()
files=files[grep("eigeN",files)]
#sizes=c(Basal=129,Her2=47,LumA=417,LumB=141,Normal=76)
sizes=c(Basal=129,Her2=47)#just to show how this work, uncomment the line above

#cl <- parallel::makeCluster(10)
cl <- parallel::makeCluster(3)#just to show how this work, uncomment the line above

clusterExport(cl, c("describe","penalty_cpgs","penalty_transcripts",
	"penalty_mir","wrapper.sgcca","files","sizes","fread"))
#resus=do.call(rbind,parLapply(cl,1:10,function(x) {
resus=do.call(rbind,parLapply(cl,1:3,function(x) {
	#1 more than the samples coz of the rownames 
	i=lapply(sizes,function(x) c(1,sample(2:x,10)))
	data=lapply(1:length(sizes),function(x) fread(files[x],select=i[[x]]))
	data=lapply(data,function(x) as.matrix(x[,2:ncol(x)],rownames=x$V1))
	data=do.call(cbind,data)
	#separate omics
	data=apply(cbind(c(1,393133,410210),c(393132,410209,410813)),1,
		function(x) t(data[x[1]:x[2],]))
	names(data)=c("CpGs","transcripts","miRNAs")
	describe(data,penalty_cpgs,penalty_transcripts,penalty_mir)}))
stopCluster(cl)
head(resus)
```

## 1.1. Paste all together

```{bash}
#cat *.tsv>penalty_search.tsv
```

## 2. Chosing the penalization values with highest AVE and lowest nfeatures

[choose_penalty.R](https://github.com/CSB-IG/SGCCA/blob/main/choose_penalty.R) → Figure S1

```{r}
suppressPackageStartupMessages({
library(ggplot2)#3.3.5
library(gridExtra)#2.3
})

temp=read.table("penalty_search.tsv",sep='\t',header=T)
temp$omic=factor(temp$omic,levels=c("CpGs","transcripts","miRNAs"))
#temp=temp[order(temp$penalty),]
#plot median AVE vs meadian nfeatures
omics=levels(temp$omic)
omics=lapply(omics,function(x) temp[temp$omic==x,])
omics=lapply(omics,function(x) as.data.frame(apply(x,2,as.numeric)))
names(omics)=levels(temp$omic)
omics=lapply(omics,function(y) sapply(unique(y$penalty),function(x) 
apply(y[y$penalty==x,],2,median,na.rm=T)))#better than mean?
omics=lapply(omics,function(x) as.data.frame(t(x)))
#indi plots or CpGs will determine axis
plots=lapply(1:3,function(x) ggplot(omics[[x]],
	aes(x=nfeatures,y=AVE,col=penalty))+geom_point()+
	ggtitle(names(omics)[x])+theme(text=element_text(size=18))+
	scale_x_continuous(trans="log10")+geom_line())
#png("sparsity_search.png")
 grid.arrange(plots[[1]],plots[[2]],plots[[3]])
#dev.off()
```

## 3. Run an SGCCA per subtype, using the adjusted penalization values

Rscript [sgcca.R](https://github.com/CSB-IG/SGCCA/blob/main/sgcca.R) Her2 → Her2.selected

```{r}
subtype="Her2"#args[1]
data=fread(paste(subtype,"eigeNormi",sep='.'))
data=as.matrix(data[,2:ncol(data)],rownames=data$V1)
#separate omics
data=apply(cbind(c(1,393133,410210),c(393132,410209,410813)),1,
	function(x) t(data[x[1]:x[2],]))
names(data)=c("CpGs","transcripts","miRNAs")
penalty=c(CpGs=0.02,transcripts=0.02,miRNAs=0.05)#output of choose_penalty.R
#ncomp=nrow(data$miRNAs)-1#the last comp has all loadings>0
ncomp=2#exchange #with the line above
final=wrapper.sgcca(X=data,penalty=penalty,scale=F,
	scheme="centroid",ncomp=ncomp)#ncomp to explain 50% of transcripts matrix according to mfa.R
#get selected features
selected=lapply(final$loadings,function(y) 
	apply(y,2,function(x) x[x!=0]))
selected=as.data.frame(do.call(rbind,lapply(selected,function(y) 
	do.call(rbind,lapply(1:length(y),function(x) 
		cbind(names(y)[x],y[[x]],names(y[[x]])))))))
colnames(selected)=c("component","final","variable")
head(selected)
#write.table(selected,paste(subtype,"selected",sep='.'),sep='\t',
#	quote=F,row.names=F)
```

## 4. SGCCA with 100 subsets of the data to check the stability

Rscript [sgccaSubsample.R](https://github.com/CSB-IG/SGCCA/blob/main/sgccaSubsample.R) Her2 1→ Her2.1.selected

The same than sgcca.R but with a random subset of half the data

## 5. Count the features selected on the subsets to check the stability

Rscript [joinSubsamples.R](https://github.com/CSB-IG/SGCCA/blob/main/joinSubsamples.R) Her2 → Her2.subsampled

```{r}
subsampled=read.table("Her2.subsampled",sep='\t',header=T)
dim(subsampled)
subsampled[1:5,1:5]
```