Skip to content

Commit cdb3759

Browse files
authored
Create CTCF_maxATAC_QC.Rmd
0 parents  commit cdb3759

File tree

1 file changed

+219
-0
lines changed

1 file changed

+219
-0
lines changed

CTCF_maxATAC_QC.Rmd

+219
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,219 @@
1+
---
2+
title: "Untitled"
3+
output: html_document
4+
---
5+
6+
```{r setup, include=FALSE}
7+
# Packages used throughout the document.
# ggplot2 and dplyr are attached here because later chunks call ggplot(),
# group_by(), summarise() and `%>%`; without them the document cannot be
# knitted from a fresh R session.
library(GenomicRanges)
library(data.table)
library(dplyr)
library(ggplot2)

knitr::opts_chunk$set(echo = TRUE)

# Region directories of the two LOLA core (hg38) collections read below.
codex_dir <- "/Users/javrodher/Work/biodata/LOLA/nm/t1/resources/regions/LOLACore/hg38/codex/regions/"
encode_dir <- "/Users/javrodher/Work/biodata/LOLA/nm/t1/resources/regions/LOLACore/hg38/encode_tfbs/regions/"

# Directory that receives every table written by this document.
outdir <- "/Users/javrodher/Work/RStudio-PRJs/TCGA_PanCan/results/"
16+
17+
```
18+
19+
```{r}
20+
# Pool the untreated ENCODE CTCF peak files listed in the LOLA index and write
# their intervals (first three columns) as a single BED file in `outdir`.

# Region files whose names mention CTCF; not used below, kept for inspection.
f_codex <- list.files(codex_dir, pattern = "CTCF")
f_encode <- list.files(encode_dir, pattern = "Ctcf")
index_codex_file <- "/Users/javrodher/Work/biodata/LOLA/nm/t1/resources/regions/LOLACore/hg38/codex/index.txt"
index_encode_file <- "/Users/javrodher/Work/biodata/LOLA/nm/t1/resources/regions/LOLACore/hg38/encode_tfbs/index.txt"

index_codex <- read.delim(index_codex_file)
index_encode <- read.delim(index_encode_file)

# which() drops rows whose annotation is NA; a bare logical index would keep
# them as all-NA rows and later produce an NA file name.
index_codex_ctcf <- index_codex[which(index_codex$antibody == "CTCF"), ]
index_encode_ctcf <- index_encode[which(index_encode$antibody == "CTCF"), ]

index_codex_ctcf$mappingGenome
index_encode_ctcf <-
  index_encode_ctcf[which(index_encode_ctcf$treatment == "None"), ]
table(index_encode_ctcf$cellType)

# as.character() guards against the column having been read as a factor.
fnames <- as.character(index_encode_ctcf$filename)
if (length(fnames) == 0) {
  stop("No untreated CTCF entries found in ", index_encode_file, call. = FALSE)
}

# Read one ENCODE peak file and tag every row with the file it came from.
read_encode_peaks <- function(fname) {
  print(fname)
  peaks <- read.delim(file.path(encode_dir, fname), header = FALSE)
  peaks$V4 <- fname
  peaks
}

# Bind once at the end instead of growing a data frame inside a loop.
X <- do.call(rbind, lapply(fnames, read_encode_peaks))
table(X$V4)

write.table(
  X[, 1:3],
  file.path(outdir, "ctcf_encode_LOLA_hg38.bed"),
  quote = FALSE, sep = "\t", col.names = FALSE, row.names = FALSE
)
51+
```
52+
53+
```{r}
54+
# sort -k1,1 -k2,2n ctcf_encode_LOLA_hg38.bed > ctcf_encode_sorted.bed
55+
# bedtools merge -i ctcf_encode_sorted.bed > ctcf_encode_merged.bed
56+
# wc -l ctcf_encode_merged.bed
57+
```
58+
59+
```{r}
60+
# Inputs for the maxATAC CTCF evaluation.
inpdir <- "/Users/javrodher/Work/RStudio-PRJs/TCGA_PanCan/data/maxATAC/CTCF/"

# Rank cut-offs evaluated further down. Because "all" is a string, c() stores
# the numeric cut-offs as character strings as well.
top_list <- c("all", 100000, 90000, 80000, 70000, 60000, 50000, 40000, 30000, 20000, 10000)

# Merged ENCODE CTCF peaks, produced outside R by the sort / bedtools merge
# commands noted in the previous chunk.
ctcf_peaks <- read.delim(file.path(outdir, "ctcf_encode_merged.bed"), header = FALSE)
names(ctcf_peaks) <- c("chr", "start", "end")
#quantile(ctcf_peaks$end-ctcf_peaks$start)

# One sub-directory of `inpdir` per sample.
samples <- list.dirs(inpdir, full.names = FALSE, recursive = FALSE)

# The serial loop that used to follow read every sample into `X` with
# rbind-in-a-loop. It duplicated the parallel read done with fx1() below and
# its result was never used, so it has been removed.
78+
79+
# Read the maxATAC prediction file of one sample and rank its rows.
#
# sample  Name of the sample sub-directory inside `inpdir`.
# inpdir  Directory holding one sub-directory per sample.
#
# Returns the contents of "<inpdir>/<sample>/maxatac_predict_32bp.bed" as a
# data frame sorted by decreasing V4, with the sample name appended as V5 and
# the 1-based position in that ordering as V6.
fx1 <- function(sample, inpdir) {
  bed_path <- paste0(inpdir, "/", sample, "/maxatac_predict_32bp.bed")
  peaks <- read.delim(bed_path, header = FALSE)

  by_score <- order(peaks$V4, decreasing = TRUE)
  ranked <- peaks[by_score, ]

  ranked$V5 <- sample
  ranked$V6 <- seq_len(nrow(ranked))
  ranked
}
88+
89+
# Build the table of predicted CTCF peaks for all samples, or reload it from
# the CSV cache in `outdir` when that file already exists.
#
# Previously the table was always computed and then unconditionally replaced
# by fread() of a CSV that nothing in this document wrote, so the computation
# was either wasted or the chunk failed on a missing file.
predicted_ctcf_file <- paste0(outdir, "/", "predicted_ctcf_allSamples.csv")

if (file.exists(predicted_ctcf_file)) {
  predicted_ctcf <- fread(predicted_ctcf_file)
} else {
  res <- parallel::mclapply(samples, fx1, inpdir = inpdir, mc.cores = 5)

  # mclapply() does not stop when a worker fails; it returns a non-data-frame
  # element for that sample instead, which must not reach the bind step.
  read_ok <- vapply(res, is.data.frame, logical(1))
  if (!all(read_ok)) {
    stop(
      "Failed to read maxATAC predictions for: ",
      paste(samples[!read_ok], collapse = ", "),
      call. = FALSE
    )
  }

  # rbindlist() yields a data.table, the same class fread() returns above.
  predicted_ctcf <- rbindlist(res)
  names(predicted_ctcf) <- c("chr", "start", "end", "score", "sample", "rank")
  predicted_ctcf$celltype <- substring(predicted_ctcf$sample, 1, 4)
  fwrite(predicted_ctcf, predicted_ctcf_file, row.names = FALSE)
}

# A cache written without the cell-type column still gets one.
if (!"celltype" %in% names(predicted_ctcf)) {
  predicted_ctcf$celltype <- substring(predicted_ctcf$sample, 1, 4)
}

table(predicted_ctcf$sample)
98+
99+
# Compare the predicted peaks with the merged ENCODE CTCF peaks: for every
# rank cut-off in `top_list`, count per sample how many predicted peaks overlap
# an ENCODE peak (tps) and how many do not (fps).
ctcf_peaks_lola.gr <- makeGRangesFromDataFrame(ctcf_peaks, keep.extra.columns = TRUE)

# Per-sample overlap counts for one cut-off.
#
# top        Either "all" or a rank cut-off (as stored in `top_list`).
# predicted  Table of predicted peaks with `sample` and `rank` columns.
# reference  GRanges of reference peaks.
#
# Returns a data frame with one row per sample: sample, tps, fps, n_peaks,
# frac_tps and top.
count_overlaps_by_sample <- function(top, predicted, reference) {
  print(top)

  if (top != "all") {
    predicted <- predicted[predicted$rank %in% seq_len(as.numeric(top)), ]
  }

  predicted.gr <- makeGRangesFromDataFrame(predicted, keep.extra.columns = TRUE)
  ovl <- as.data.frame(findOverlaps(predicted.gr, reference))
  overlap_lola <- seq_len(nrow(predicted)) %in% ovl$queryHits

  # One two-way table keeps tps and fps aligned by sample. Two separate
  # table() calls drop a sample that has no hits (or no misses) from one of
  # them, which misaligns the columns or makes data.frame() fail.
  counts <- table(
    predicted$sample,
    factor(overlap_lola, levels = c(FALSE, TRUE))
  )

  df <- data.frame(
    sample = rownames(counts),
    tps = as.vector(counts[, "TRUE"]),
    fps = as.vector(counts[, "FALSE"]),
    stringsAsFactors = FALSE
  )
  df$n_peaks <- df$tps + df$fps
  df$frac_tps <- df$tps / df$n_peaks
  df$top <- top
  df
}

# Bind once at the end instead of growing DF inside a loop from an NA row.
DF <- do.call(
  rbind,
  lapply(
    top_list,
    count_overlaps_by_sample,
    predicted = predicted_ctcf,
    reference = ctcf_peaks_lola.gr
  )
)
DF$celltype <- substring(DF$sample, 1, 4)
table(DF$celltype)
fwrite(DF, paste0(outdir, "/", "predicted_ctcf_stats.csv"), row.names = FALSE)
127+
library(gridGraphics)

# Order the cut-offs from "all" down to the smallest top-N so that the facets
# appear in that order.
DF$top <- factor(
  DF$top,
  levels = c("all", seq(from = 100000, to = 10000, by = -10000))
)

# Distribution of the true-positive fraction across samples, one panel per
# cut-off, with a red reference line at 0.4.
ggplot(data = DF, mapping = aes(x = frac_tps)) +
  geom_histogram() +
  facet_wrap(~top) +
  theme_bw() +
  geom_vline(xintercept = 0.4, color = "red")
137+
```
138+
139+
```{r}
140+
# Score distribution of every sample, one row of panels per cell type.
# Sample names are hidden on the x axis.
ggplot(data = predicted_ctcf, mapping = aes(x = sample, y = score)) +
  geom_boxplot() +
  facet_wrap(~celltype, scales = "free_x", ncol = 1) +
  theme(axis.text.x = element_blank()) +
  labs(x = "Samples", y = "Peak score")
146+
```
147+
148+
149+
```{r}
150+
# Median peak score of each sample, plotted as one bar per sample and split
# into panels by cell type (first four characters of the sample name).
df_score <- summarise(
  group_by(predicted_ctcf, sample),
  median_score = median(score)
)
df_score$celltype <- substring(df_score$sample, 1, 4)

ggplot(data = df_score, mapping = aes(x = sample, y = median_score)) +
  geom_col() +
  facet_wrap(~celltype, scales = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_blank()) +
  labs(x = "Samples", y = "Peak score (Median)")
160+
161+
```
162+
163+
```{r}
164+
# True-positive fraction across all samples for each rank cut-off.
ggplot(data = DF, mapping = aes(x = top, y = frac_tps)) +
  geom_boxplot() +
  theme_bw()
167+
```
168+
169+
```{r}
170+
# True-positive fraction for each rank cut-off, one panel per cell type.
# The x labels are rotated so the cut-off values stay readable.
ggplot(data = DF, mapping = aes(x = top, y = frac_tps)) +
  geom_boxplot() +
  facet_wrap(~celltype) +
  theme_bw() +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  labs(x = "Peak number", y = "True positives (fraction)")
177+
178+
```
179+
180+
```{r}
181+
# Number of samples per cell type, counted on the rows of the "all" cut-off.
summarise(
  group_by(DF[DF$top == "all", ], celltype),
  n = n()
)
182+
```
183+
184+
```{r}
185+
# Number of predicted peaks per sample (rows of the "all" cut-off), first
# summarised per cell type ...
ggplot(data = DF[DF$top == "all", ], mapping = aes(x = celltype, y = n_peaks)) +
  geom_boxplot()

# ... then shown sample by sample, one panel per cell type.
ggplot(data = DF[DF$top == "all", ], mapping = aes(x = sample, y = n_peaks)) +
  geom_col() +
  facet_wrap(~celltype, scales = "free_x") +
  theme_bw() +
  theme(axis.text.x = element_blank()) +
  labs(x = "Samples", y = "Predicted CTCF peaks (count)")
195+
```
196+
197+
198+
199+
200+
201+
```{r}
202+
# number of predicted peaks per cancer type
203+
# number of tps per cancer type per top selection
204+
# scores of predicted peaks per cancer type
205+
```
206+
207+
```{r}
208+
# Number of predicted peaks per sample: quantiles and histogram.
x <- as.data.frame(table(predicted_ctcf$sample))
quantile(x$Freq)
hist(x$Freq, breaks = 50)

# Quantiles of the non-missing peak scores over all samples.
with(predicted_ctcf, quantile(score[!is.na(score)], na.rm = FALSE))
212+
```
213+
214+
```{r}
215+
# Score histograms for the first 20 entries of `samples`, one panel each.
ggplot(
  data = predicted_ctcf[predicted_ctcf$sample %in% samples[1:20], ],
  mapping = aes(x = score)
) +
  geom_histogram() +
  facet_wrap(~sample)
218+
```
219+

0 commit comments

Comments
 (0)