-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpretty_MA_plot.R
155 lines (147 loc) · 6.49 KB
/
pretty_MA_plot.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#' Plot an annotated MA plot for a DESeq2-style DEA results table
#'
#' Draws log2 fold-change against mean expression (log10 x axis), colours
#' points by significance (padj < 0.05) and direction, and labels genes with
#' `ggrepel`. Up- and down-regulated labels are placed in separate layers so
#' that labels stay on their own side of the origin.
#'
#' @param results DESeq2 results() output, can be pre-cast as data frame or not.
#'   Must provide the columns `baseMean`, `log2FoldChange` and `padj`.
#' @param convert_ids whether or not to convert from ensembl gene IDs to gene symbol
#' @param id_col the name of the column in your df with ensembl ids. By default, just uses rownames
#' @param mart_name the name of the biomart library to use. Defaults to mouse, ensembl
#' @param name_col the name of the column with gene symbols to be used for plotting.
#'   Ignored if convert_ids = TRUE (the converted `external_gene_name` column is used).
#'   With convert_ids = FALSE the default "row.names" labels genes by their row names.
#' @param lfc_threshold the minimum absolute log2 fold-change for labeling. Defaults to 0 (all significant genes)
#' @param genes genes to subset to; when NULL, all significant genes passing
#'   `lfc_threshold` are labelled
#' @param highlight_genes gene labels to highlight with larger text (1.5x `label_text_size`)
#' @param custom_annotation a custom gene conversion set (helpful for metagenomes); must be in biomart format
#' @param max_overlaps passed to geom_label_repel()
#' @param random_seed passed to geom_label_repel() as `seed`, making label placement reproducible
#' @param label_alpha alpha value for gene labels
#' @param label_text_size text size of gene labels
#' @param y_min minimum value of y for resultant plot
#' @param y_max maximum value of y for resultant plot
#' @param label_only_sig if true, label only significant genes from 'genes' argument
#' @return an MA plot generated in ggplot2
#' @import ggplot2
#' @import ggrepel
#' @import biomaRt
#' @import dplyr
#' @import tibble
#' @import tidyverse
#' @export
pretty_MA_plot = function(results,
                          convert_ids = TRUE,
                          id_col = "row.names",
                          mart_name = "mmusculus_gene_ensembl",
                          name_col = "row.names",
                          lfc_threshold = 0,
                          genes = NULL,
                          highlight_genes = c(),
                          custom_annotation = NULL,
                          max_overlaps = 10,
                          label_alpha = 1,
                          label_text_size = (10 / .pt),
                          y_min = NA,
                          y_max = NA,
                          label_only_sig = FALSE,
                          random_seed = 12345)
{
  if(convert_ids) #from ensembl to common symbols
  {
    results = id_convert(results = results,
                         id_col = id_col,
                         mart_name = mart_name,
                         name_col = name_col,
                         custom_annotation = custom_annotation)
    name_col = "external_gene_name"
  } else
  {
    if(!is.data.frame(results)) #convert to standard data frame, as necessary
    {
      results = as.data.frame(results)
    }
    #the default name_col refers to the row names, which are not a real column yet
    if(identical(name_col, "row.names") && !("row.names" %in% colnames(results)))
    {
      results = tibble::rownames_to_column(results, var = name_col)
    }
  }

  #fail early with a readable message instead of deep inside ggplot2
  missing_cols = setdiff(c("baseMean", "log2FoldChange", "padj", name_col),
                         colnames(results))
  if(length(missing_cols) > 0)
  {
    stop("`results` is missing required column(s): ",
         paste(missing_cols, collapse = ", "),
         call. = FALSE)
  }

  #add colors for significant; genes without a usable padj (e.g. filtered out
  #by DESeq2) or with a fold change of exactly 0 are treated as not significant
  results = results %>%
    mutate(color = factor(case_when(is.na(padj) | padj >= 0.05 ~ "NS",
                                    log2FoldChange > 0 ~ "upregulated",
                                    log2FoldChange < 0 ~ "downregulated",
                                    TRUE ~ "NS")))

  plt = ggplot(results,
               aes(x = baseMean, y = log2FoldChange)) +
    geom_point(aes(color = color)) +
    scale_x_log10(limits = c(1, NA)) +
    scale_color_manual(values = c("NS" = alpha(colour = "grey50", alpha = 0.05),
                                  "upregulated" = "firebrick4",
                                  "downregulated" = "dodgerblue4")) +
    theme_bw(base_family = "Arial") +
    theme(legend.position = "none") +
    xlab("Mean Expression") +
    ylab("log2(Fold Change)")

  lfc = results$log2FoldChange
  is_sig = !is.na(results$padj) & results$padj < 0.05

  #decide which rows get a label
  if(is.null(genes))
  {
    to_label = is_sig & !is.na(lfc) & abs(lfc) >= lfc_threshold
    segment_min = 0.5 #ggrepel's default min.segment.length
  } else
  {
    if(isTRUE(label_only_sig))
    {
      genes = intersect(genes, results[[name_col]][is_sig])
    }
    to_label = results[[name_col]] %in% genes
    segment_min = 0.1
  }

  #one label layer restricted to a vertical band of the panel
  add_labels = function(rows, y_range)
  {
    geom_label_repel(data = results[which(rows), , drop = FALSE],
                     aes(label = .data[[name_col]],
                         size = factor(.data[[name_col]] %in% highlight_genes)),
                     min.segment.length = segment_min,
                     max.overlaps = max_overlaps,
                     fill = alpha("white", label_alpha),
                     ylim = y_range,
                     seed = random_seed)
  }

  #plot as separate up and down to prevent crossing of the origin
  plt = plt +
    add_labels(to_label & lfc > 0, c(1, NA)) +
    add_labels(to_label & lfc <= 0, c(NA, -1)) +
    scale_size_manual(values = c("TRUE" = label_text_size * 1.5,
                                 "FALSE" = label_text_size))

  #edit ylim as necessary
  if(!is.na(y_min) || !is.na(y_max))
  {
    plt = plt +
      ylim(y_min, y_max)
  }
  return(plt)
}
#' Attach gene symbols to a results table keyed by Ensembl gene ID
#'
#' Left-joins a conversion table (columns `ensembl_gene_id` and, typically,
#' `external_gene_name`) onto `results`. The conversion table is either
#' `custom_annotation` or, when that is NULL, fetched from Ensembl BioMart.
#'
#' @param results DESeq2-style results; coerced with as.data.frame() if needed
#' @param id_col name of the column holding the Ensembl IDs. If no such column
#'   exists, the row names are moved into a new column with this name.
#' @param mart_name BioMart dataset name, used only when `custom_annotation` is NULL
#' @param name_col currently unused; kept so existing callers keep working
#' @param custom_annotation optional conversion table in biomart format; must
#'   contain an `ensembl_gene_id` column
#' @return `results` as a data frame with the conversion table's columns joined on
#' @noRd
id_convert = function(results,
                      id_col = "row.names",
                      mart_name = "mmusculus_gene_ensembl",
                      name_col = "row.names",
                      custom_annotation = NULL)
{
  if(!is.data.frame(results)) #convert to standard data frame, as necessary
  {
    results = as.data.frame(results)
  }

  if(!is.null(custom_annotation))
  {
    conv = custom_annotation
    if(!("ensembl_gene_id" %in% colnames(conv)))
    {
      stop("`custom_annotation` must contain an 'ensembl_gene_id' column",
           call. = FALSE)
    }
  } else
  {
    #biomaRt is only needed for the online lookup, so check for it only here
    if(!requireNamespace("biomaRt", quietly = TRUE))
    {
      stop("Package 'biomaRt' is required unless `custom_annotation` is supplied",
           call. = FALSE)
    }
    mart = biomaRt::useMart("ensembl", mart_name)
    conv = biomaRt::getBM(attributes = c("ensembl_gene_id", "external_gene_name"),
                          mart = mart)
  }

  #only promote the row names when the id column does not already exist;
  #rownames_to_column() errors on a duplicate column name
  if(!(id_col %in% colnames(results)))
  {
    results = tibble::rownames_to_column(results, var = id_col)
  }

  results = dplyr::left_join(results,
                             conv,
                             by = setNames("ensembl_gene_id", id_col))
  return(results)
}