forked from nf-core/chipseq
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot_macs3_qc.r
executable file
·155 lines (134 loc) · 6.91 KB
/
plot_macs3_qc.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env Rscript
################################################
################################################
## LOAD LIBRARIES ##
################################################
################################################
library(optparse)
library(ggplot2)
library(reshape2)
library(scales)
################################################
################################################
## PARSE COMMAND-LINE PARAMETERS ##
################################################
################################################
## Command-line interface. Peak files and sample ids are both given as
## comma-separated lists and are paired up by position.
option_list <- list(
    make_option(c("-i", "--peak_files"), type="character", default=NULL, help="Comma-separated list of peak files.", metavar="path"),
    make_option(c("-s", "--sample_ids"), type="character", default=NULL, help="Comma-separated list of sample ids associated with peak files. Must be unique and in same order as peaks files input.", metavar="string"),
    make_option(c("-o", "--outdir"), type="character", default='./', help="Output directory", metavar="path"),
    make_option(c("-p", "--outprefix"), type="character", default='macs3_peakqc', help="Output prefix", metavar="string")
)

opt_parser <- OptionParser(option_list=option_list)
opt <- parse_args(opt_parser)

## Both list arguments are mandatory.
if (is.null(opt$peak_files)) {
    print_help(opt_parser)
    stop("At least one peak file must be supplied", call.=FALSE)
}
if (is.null(opt$sample_ids)) {
    print_help(opt_parser)
    stop("Please provide sample ids associated with peak files.", call.=FALSE)
}

## Create the output directory (and any missing parents) if it is not there yet.
if (!file.exists(opt$outdir)) {
    dir.create(opt$outdir, recursive=TRUE)
}

PeakFiles <- unlist(strsplit(opt$peak_files, ","))
SampleIDs <- unlist(strsplit(opt$sample_ids, ","))

## An empty string passes the is.null() checks above but splits into zero
## entries, so the "at least one" requirements are re-checked on the parsed
## vectors.
if (length(PeakFiles) == 0) {
    print_help(opt_parser)
    stop("At least one peak file must be supplied", call.=FALSE)
}
if (length(SampleIDs) == 0) {
    print_help(opt_parser)
    stop("Please provide sample ids associated with peak files.", call.=FALSE)
}

## Files and ids are matched by index, so the two lists must be the same length.
if (length(PeakFiles) != length(SampleIDs)) {
    print_help(opt_parser)
    stop("Number of sample ids must equal number of peak files.", call.=FALSE)
}

## The help text requires unique sample ids. Duplicated ids would pool the
## peaks of different files under a single sample in the plots, so reject
## them up front instead of producing misleading output.
if (anyDuplicated(SampleIDs) > 0) {
    print_help(opt_parser)
    stop(paste0("Sample ids must be unique. Duplicated ids: ",
                paste(unique(SampleIDs[duplicated(SampleIDs)]), collapse=", ")),
         call.=FALSE)
}
################################################
################################################
## READ IN DATA ##
################################################
################################################
## Column names applied positionally to every peak file. Files whose name ends
## in ".narrowPeak" get one additional trailing "summit" column.
base.header <- c("chrom","start","end","name","pileup", "strand", "fold", "-log10(pvalue)","-log10(qvalue)")

## Per-file results are collected in lists and bound together once after the
## loop, instead of re-copying growing data frames on every iteration.
plot.list <- vector("list", length(PeakFiles))
summary.list <- vector("list", length(PeakFiles))

## seq_along() rather than 1:length() so that an empty PeakFiles runs zero
## iterations instead of iterating over c(1, 0).
for (idx in seq_along(PeakFiles)) {
    sampleid <- SampleIDs[idx]

    ## Pick the header from the last dot-separated component of the file name.
    header <- base.header
    fsplit <- unlist(strsplit(basename(PeakFiles[idx]), split='.', fixed=TRUE))
    if (fsplit[length(fsplit)] == 'narrowPeak') {
        header <- c(header, "summit")
    }

    peaks <- read.table(PeakFiles[idx], sep="\t", header=FALSE)

    ## Too few columns would otherwise fail inside `colnames<-` with a cryptic
    ## length error; report which file is malformed instead.
    if (ncol(peaks) < length(header)) {
        stop(paste0("Peak file '", PeakFiles[idx], "' has ", ncol(peaks),
                    " columns but at least ", length(header), " were expected."),
             call.=FALSE)
    }
    colnames(peaks) <- header

    ## GET SUMMARY STATISTICS
    peaks.dat <- peaks[,c('fold','-log10(qvalue)','-log10(pvalue)')]
    peaks.dat$length <- (peaks$end - peaks$start)

    ## One single-row matrix per measure: the summary() statistics plus the
    ## peak count, the measure name and the sample id.
    summary.list[[idx]] <- lapply(colnames(peaks.dat), function(cname) {
        sdat <- summary(peaks.dat[,cname])
        sdat["num_peaks"] <- nrow(peaks.dat)
        sdat["measure"] <- cname
        sdat["sample"] <- sampleid
        t(data.frame(x=matrix(sdat),row.names=names(sdat)))
    })

    ## Short column names used by the plotting code ('-log10(qvalue)' -> 'fdr').
    colnames(peaks.dat) <- c('fold','fdr','pvalue','length')
    peaks.dat$name <- rep(sampleid,nrow(peaks.dat))
    plot.list[[idx]] <- peaks.dat
}

## Bind everything in one call. The leading empty data frame keeps the
## data-frame rbind method in charge, so the single-row summary matrices are
## converted to data frames exactly as when they were appended one at a time.
plot.dat <- do.call(rbind, c(list(data.frame()), plot.list))
summary.dat <- do.call(rbind, c(list(data.frame()), unlist(summary.list, recursive=FALSE)))

## Samples are ordered alphabetically on the plot axes.
plot.dat$name <- factor(plot.dat$name, levels=sort(unique(as.character(plot.dat$name))))

SummaryFile <- file.path(opt$outdir,paste0(opt$outprefix,".summary.txt"))
write.table(summary.dat,file=SummaryFile,quote=FALSE,sep="\t",row.names=FALSE,col.names=TRUE)
################################################
################################################
## PLOTS ##
################################################
################################################
## BUILDS A VIOLIN PLOT WITH A NARROW BOXPLOT DRAWN ON TOP OF EACH VIOLIN
##   plot.dat : data frame holding the columns named by x and y
##   x, y     : column names, passed as strings; x also sets violin colour/fill
##   ylab     : y-axis label
##   title    : plot title
##   log      : 10 -> log10 y-axis, 2 -> log2 y-axis, any other value leaves
##              the y-axis untransformed
## RETURNS THE GGPLOT OBJECT (THE CALLER IS RESPONSIBLE FOR PRINTING IT)
violin.plot <- function(plot.dat,x,y,ylab,title,log) {
    ## Bare look: no legend, grid or panel background, solid black axis lines.
    axis.line <- element_line(size = 1, colour = "black", linetype = "solid")
    bare.theme <- theme(legend.position="none",
                        panel.grid.major = element_blank(),
                        panel.grid.minor = element_blank(),
                        panel.background = element_blank(),
                        axis.text.y = element_text(colour="black"),
                        axis.text.x= element_text(colour="black",face="bold"),
                        axis.line.x = axis.line,
                        axis.line.y = axis.line)

    violin <- ggplot(plot.dat, aes_string(x=x, y=y)) +
        geom_violin(aes_string(colour=x,fill=x), alpha = 0.3) +
        geom_boxplot(width=0.1) +
        xlab("") +
        ylab(ylab) +
        ggtitle(title) +
        bare.theme

    ## Optional log-scaled y-axis with breaks labelled as powers of the base.
    if (log == 10) {
        violin <- violin + scale_y_continuous(trans='log10',
                                              breaks = trans_breaks("log10", function(x) 10^x),
                                              labels = trans_format("log10", math_format(10^.x)))
    } else if (log == 2) {
        violin <- violin + scale_y_continuous(trans='log2',
                                              breaks = trans_breaks("log2", function(x) 2^x),
                                              labels = trans_format("log2", math_format(2^.x)))
    }

    violin
}
############################
## All plots go into one multi-page PDF; the page is 3 inches wide per sample.
PlotFile <- file.path(opt$outdir, paste0(opt$outprefix, ".plots.pdf"))
n.samples <- length(unique(plot.dat$name))
pdf(PlotFile, height=6, width=3*n.samples)

## PEAK COUNT PLOT
## One bar per sample, labelled with the number of peaks just above the bar.
peak.count.dat <- as.data.frame(table(plot.dat$name))
colnames(peak.count.dat) <- c("name","count")

count.axis.line <- element_line(size = 1, colour = "black", linetype = "solid")
count.theme <- theme(legend.position="none",
                     panel.grid.major = element_blank(),
                     panel.grid.minor = element_blank(),
                     panel.background = element_blank(),
                     axis.text.y = element_text(colour="black"),
                     axis.text.x= element_text(colour="black",face="bold"),
                     axis.line.x = count.axis.line,
                     axis.line.y = count.axis.line)

plot <- ggplot(peak.count.dat, aes(x=name, y=count)) +
    geom_bar(stat="identity", aes(colour=name,fill=name), position = "dodge", width = 0.8, alpha = 0.3) +
    geom_text(aes(label = count, x = name, y = count), position = position_dodge(width = 0.8), vjust = -0.6) +
    xlab("") +
    ylab("Number of peaks") +
    ggtitle("Peak count") +
    count.theme
print(plot)

## VIOLIN PLOTS
## One page per measure, in this order: length, fold, fdr, pvalue.
## scale = 10 / 2 selects a log10 / log2 y-axis; -1 leaves it untransformed.
violin.specs <- list(
    list(column="length", label=expression(log[10]*" peak length"),    title="Peak length distribution", scale=10),
    list(column="fold",   label=expression(log[2]*" fold-enrichment"), title="Fold-change distribution", scale=2),
    list(column="fdr",    label=expression(-log[10]*" qvalue"),        title="FDR distribution",         scale=-1),
    list(column="pvalue", label=expression(-log[10]*" pvalue"),        title="Pvalue distribution",      scale=-1)
)
for (spec in violin.specs) {
    print(violin.plot(plot.dat=plot.dat, x="name", y=spec$column, ylab=spec$label, title=spec$title, log=spec$scale))
}

dev.off()
################################################
################################################
################################################
################################################