-
Notifications
You must be signed in to change notification settings - Fork 1
/
G4structure_vs_sequence.R
203 lines (142 loc) · 9.75 KB
/
G4structure_vs_sequence.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
# ======= Libraries and working directory
# Packages are attached in this order on purpose: when two packages export the
# same name, the one attached later masks the earlier one, so do not reorder.
# NOTE(review): only ggplot2 and ggrepel functions are called by name in the
# visible script; data.table, dplyr and ggsignif may be unused - confirm before
# removing.
library(data.table)
library(dplyr)
library(ggplot2)
library(ggsignif)
library(ggrepel)
# Every input and output path used below is relative, so they all resolve
# against this directory. The path itself is relative to wherever R was started.
setwd('ENCODE_Analysis/')
# ==== Script parameters
# Logical switches (TRUE/FALSE). They are only ever tested with `if (...)`, so
# logical values behave exactly like the former numeric 1.
# NOTE(review): plotdata is not referenced anywhere else in the visible script;
# the plot is always drawn and saved.
plotdata <- TRUE
# When TRUE, the merged / filtered / trimmed tables are written out as CSV.
savetables <- TRUE
# ====== Parameters for Filtering the data ====
# A data set is kept only if BOTH GAT q-values (DHS and noBG4atTSS) are below
# this cutoff.
qValue_cutoff <- 0.05
# A data set is kept only if its peak count is strictly greater than this.
PEAK_NUMBER_Cutoff <- 500
# File accessions that are removed regardless of their statistics.
# ("HNRNPF", "RBM15", "RBM17", "RBM34") data sets have been flagged in the
# original paper: Cell 2019 (10.1016/j.cell.2019.06.001)
Explicit_exclusion <- c("ENCFF782GWS")
# ====== Load meta data
# ENCODE metadata sheet (tab separated, with header row).
ENCODE_bed_meta <- read.table(
  file = "ENCODE_K562_Jan2019_BED_Meta.tsv",
  sep = "\t",
  header = TRUE
)
# Remove the "-human" tag from the target names.
ENCODE_bed_meta$Experiment.target <- gsub("-human", "", ENCODE_bed_meta$Experiment.target)
# Remove empty rows, i.e. rows in which every single field is NA.
ENCODE_bed_meta <- ENCODE_bed_meta[rowSums(is.na(ENCODE_bed_meta)) != ncol(ENCODE_bed_meta), ]

# Read in peak numbers (no header row: column 1 = file name, column 2 = count).
Peak_Number <- read.table(
  file = "GAT_opOQ_noBG4_TSSK562_Jan2019_NumberOfPeaks.txt",
  sep = "\t",
  header = FALSE
)
colnames(Peak_Number)[1:2] <- c("ENCODE_ID", "Peaks")
# Strip everything from the first "." onwards (the file extension) so that the
# ID matches File.accession. gsub() is already vectorised, so no sapply() is
# needed; as.character() guards against the column having been read as factor.
Peak_Number$ENCODE_ID <- gsub("\\..*", "", as.character(Peak_Number$ENCODE_ID))
# Attach the peak count to the metadata; accessions without a count become NA.
ENCODE_bed_meta$Peaks <- Peak_Number$Peaks[match(ENCODE_bed_meta$File.accession, Peak_Number$ENCODE_ID)]
# ================================================================================================================================================================
# ==== Load result tables for the GAT runs using different workspaces (two runs are loaded below)
# ================================================================================================================================================================

# Read one GAT result table and bring it into the shape used by this script.
#
# path: file name of a tab-separated GAT result table with a header row.
# Returns a data.frame holding columns 2-11 of the file, with the first kept
# column renamed to ENCODE_ID (".cut.bed.gz" removed), rows sorted by
# decreasing `fold`, and an extra integer column `rank` (1 = highest fold).
load_gat_table <- function(path) {
  gat <- read.table(file = path, sep = "\t", header = TRUE)
  gat <- gat[, 2:11]
  colnames(gat)[1] <- "ENCODE_ID"
  # fixed = TRUE: the suffix is a literal string, the dots must not act as
  # regex wildcards.
  gat$ENCODE_ID <- gsub(".cut.bed.gz", "", gat$ENCODE_ID, fixed = TRUE)
  # Sort by fold and record the position so one can see how high each data
  # set scored.
  gat <- gat[order(gat$fold, decreasing = TRUE), ]
  # seq_len() also works for an empty table (seq.int(0) would yield c(1, 0)).
  gat$rank <- seq_len(nrow(gat))
  gat
}

# ====== open OQS without BG4 signal upstream of TSS
# These are the old sites 5kb upstream + 5UTR
#GAT_opOQ_noBG4_TSS <- read.table(file = "GAT_opOQ_noBG4_TSSGAT.opOQs_noBG4_atTSS.K562.DHS.Rerun_2019.DHS.tsv", sep = '\t', header = TRUE)
# Use 1kb upstream + 5'UTR
GAT_opOQ_noBG4_TSS <- load_gat_table("GAT_opOQ_noBG4_TSSGAT.opOQs_noBG4_1kbupstreamTSS.K562.DHS.Rerun_2019.DHS.tsv")

# ====== Original shuffling of BG4 peaks in DHS
Gat_DHS <- load_gat_table("GAT_opOQ_noBG4_TSSGAT_K562_BG4_Rerun_Jan2019.DHS.tsv")
# ================================================================================================================================================================
# === Generate a Merged data sheet comprising ENCODE meta data and GAT analysis for the different runs ====
# ================================================================================================================================================================

# Copy the GAT statistics of one run into the merged sheet.
#
# merged: data.frame with a File.accession column.
# gat:    GAT result table with an ENCODE_ID column.
# prefix: string prepended to each copied column name.
# Returns `merged` with one new column per statistic (<prefix><statistic>);
# rows whose File.accession has no entry in `gat` receive NA.
add_gat_columns <- function(merged, gat, prefix) {
  gat_stats <- c(
    "observed", "expected", "CI95low", "CI95high", "stddev",
    "fold", "l2fold", "pvalue", "qvalue", "rank"
  )
  # The row lookup is identical for every statistic, so compute it once.
  gat_row <- match(merged$File.accession, gat$ENCODE_ID)
  for (stat in gat_stats) {
    merged[[paste0(prefix, stat)]] <- gat[[stat]][gat_row]
  }
  merged
}

Merged_all <- ENCODE_bed_meta
GAT_opOQ_noBG4_TSS$Peaks <- Peak_Number$Peaks[match(GAT_opOQ_noBG4_TSS$ENCODE_ID, Peak_Number$ENCODE_ID)]
# without BG4 signal at TSS 5UTR
Merged_all <- add_gat_columns(Merged_all, GAT_opOQ_noBG4_TSS, "noBG4atTSS_")
# BG4 peaks shuffled in DHS (open chromatin)
Merged_all <- add_gat_columns(Merged_all, Gat_DHS, "DHS_")
# Remove empty rows, i.e. rows in which every single field is NA.
Merged_all <- Merged_all[rowSums(is.na(Merged_all)) != ncol(Merged_all), ]
# Export the complete merged sheet before any filtering is applied.
# row.names is left at its default here, so the row names are written as the
# first column of this file.
if (savetables) {
  write.csv(
    Merged_all,
    file = "GAT_K562_GrichnessAtOQs_Dec2019_Gat-Analysis_all.csv"
  )
}
#====== FILTER q-values
# Relevant bed files have already been selected prior to downloading.
# The q-value gives an idea of the reliability of the shuffling analysis. Only
# keep cases where shuffling was successful in case of 'WithBG' and
# 'noBG4_atTSS' as these will be the relevant data sets.
#
# A data set is kept when both q-values are below qValue_cutoff, it has more
# than PEAK_NUMBER_Cutoff peaks, and it has not been explicitly flagged.
# which() drops rows whose condition is NA (accessions without a GAT result or
# without a peak count). Plain logical indexing would turn each of those into
# an all-NA row that then gets ranked and exported.
GAT_FILTERED <- Merged_all[
  which(
    Merged_all$DHS_qvalue < qValue_cutoff &
      Merged_all$noBG4atTSS_qvalue < qValue_cutoff &
      Merged_all$Peaks > PEAK_NUMBER_Cutoff &
      !(Merged_all$File.accession %in% Explicit_exclusion)
  ),
]
# Update relative ranking after removing datasets (1 = highest fold).
# seq_len() keeps this valid even if no data set survives the filter.
GAT_FILTERED <- GAT_FILTERED[order(GAT_FILTERED$noBG4atTSS_fold, decreasing = TRUE), ]
GAT_FILTERED$noBG4atTSS_rank <- seq_len(nrow(GAT_FILTERED))
GAT_FILTERED <- GAT_FILTERED[order(GAT_FILTERED$DHS_fold, decreasing = TRUE), ]
GAT_FILTERED$DHS_rank <- seq_len(nrow(GAT_FILTERED))
if (savetables) {
  write.csv(GAT_FILTERED, file = "GAT_K562_GrichnessAtOQs_Dec2019_Gat-Analysis_FILTERED.csv", row.names = FALSE)
}
# =========== Trimmed and condensed version of the filtered table ===========
# Generate a trimmed version only containing the most relevant parameters.
GAT_FILTERED_trim <- GAT_FILTERED[, c(
  "File.accession", "Experiment.accession", "Experiment.target", "Peaks",
  "noBG4atTSS_fold", "noBG4atTSS_qvalue", "noBG4atTSS_rank",
  "DHS_fold", "DHS_qvalue", "DHS_rank"
)]
# Ratio of DHS vs noBG4atTSS fold enrichment.
GAT_FILTERED_trim$ratio <- GAT_FILTERED_trim$DHS_fold / GAT_FILTERED_trim$noBG4atTSS_fold
# Rank by that ratio (1 = largest ratio). The rank length is taken from the
# table it is assigned to.
GAT_FILTERED_trim <- GAT_FILTERED_trim[order(GAT_FILTERED_trim$ratio, decreasing = TRUE), ]
GAT_FILTERED_trim$ratio_rank <- seq_len(nrow(GAT_FILTERED_trim))
if (savetables) {
  # Write the trimmed table itself. Previously the untrimmed GAT_FILTERED was
  # written under this "_trim" file name, so ratio and ratio_rank were missing.
  write.csv(GAT_FILTERED_trim, file = "GAT_K562_GrichnessAtOQs_Dec2019_Gat-Analysis_FILTERED_trim.csv", row.names = FALSE)
}
# =====
# Plots
# =======
# For plots remove all the additional tag information: eGFP, 3xFLAG.
# The tags are literal strings, hence fixed = TRUE.
GAT_FILTERED_trim$Experiment.target <- gsub("eGFP-", "", GAT_FILTERED_trim$Experiment.target, fixed = TRUE)
GAT_FILTERED_trim$Experiment.target <- gsub("3xFLAG-", "", GAT_FILTERED_trim$Experiment.target, fixed = TRUE)
# Label G4 associated proteins.
# The list is read without a header row, so the names end up in column V1.
Known_G4_proteins <- read.csv(file = "G4IPD_Oct2019.csv", header = FALSE)
# Plot data: the trimmed table plus a two-valued class column G4IPD
# ("#aad2a5" = target is in the known-G4-protein list, "grey50" = it is not).
temp <- GAT_FILTERED_trim
temp$G4IPD <- ifelse(temp$Experiment.target %in% Known_G4_proteins$V1, "#aad2a5", "grey50")
# Scatter plot of DHS_fold (x) against noBG4atTSS_fold (y), one point per data
# set, coloured by the G4IPD class, with labels for the outliers and a
# transparent background.
gg_grey <- ggplot(temp, aes(x = DHS_fold, y = noBG4atTSS_fold, colour = G4IPD)) +
  geom_point() +
  theme_minimal() +
  labs(
    title = "TF enrichment vs G-richness",
    y = "enrichment of OQS at TSS \n without BG4",
    x = "enrichment at endogenous G4s",
    color = ""
  ) +
  coord_fixed() +
  scale_y_continuous(breaks = seq(0, 5, 2)) +
  scale_x_continuous(breaks = seq(0, 11, 2)) +
  # Named values tie each colour to its class. With unnamed values the colours
  # are handed out by position, so "grey50" would be drawn in green whenever
  # no known G4 protein is present in the data.
  # NOTE(review): the legend keys show the raw class values ("#aad2a5",
  # "grey50"); add `labels =` here if readable legend text is wanted.
  scale_color_manual(values = c("#aad2a5" = "#639b57", "grey50" = "black")) +
  # Label data sets with a high DHS/noBG4atTSS ratio and strong DHS enrichment.
  # Columns are referenced directly from the plot data rather than through
  # GAT_FILTERED_trim$..., so the labels always follow the plotted rows.
  geom_label_repel(
    aes(label = ifelse(ratio > 2.7 & DHS_fold > 6.5, as.character(Experiment.target), "")),
    box.padding = 0.4, point.padding = 0.4, size = rel(2.5), segment.color = "grey80"
  ) +
  # Label data sets with a low ratio that are still enriched without BG4.
  geom_label_repel(
    aes(label = ifelse(ratio < 0.4 & noBG4atTSS_fold > 1, as.character(Experiment.target), "")),
    box.padding = 0.4, point.padding = 0.4, size = rel(2.5), segment.color = "grey20"
  ) +
  theme(
    axis.title.y = element_text(size = rel(1), vjust = 2, angle = 90),
    axis.title.x = element_text(size = rel(1), angle = 0),
    plot.title = element_text(face = "bold", size = rel(1.2)),
    legend.position = "bottom",
    panel.background = element_rect(fill = "transparent"),
    plot.background = element_rect(fill = "transparent", color = NA)
  )
plot(gg_grey)
# Make sure the output directory exists, then save exactly this plot object
# instead of relying on whatever the last displayed plot happens to be.
if (!dir.exists("figures")) {
  dir.create("figures", recursive = TRUE)
}
ggsave(
  "figures/GAT_K562_GrichnessAtOQs_Dec2019_vsBG4.pdf",
  plot = gg_grey,
  width = 20 / 2.54, height = 14 / 2.54,
  useDingbats = FALSE
)