-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathoncoprint2.R
executable file
·261 lines (208 loc) · 9.42 KB
/
oncoprint2.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
library(readxl)

# Find the dated variants workbook in the external data directory.
# Exactly one matching file must be present.
variants.filename <- list.files(
  path = data_outside_app_dir,
  pattern = glob2rx("20*_pdx_dna_variants.xlsx")
)
if (length(variants.filename) != 1) {
  stop("too few or too many _pdx_dna_variants.xlsx files in dropbox")
}

# Load the first sheet with every column read as text, then convert the
# result to a plain data.frame.
# var_native <- read_excel(file.path(data_outside_app_dir,variants.filename),sheet=1)
variants <- as.data.frame(
  read_excel(
    file.path(data_outside_app_dir, variants.filename),
    sheet = 1,
    col_types = "text"
  )
)

# `vs` is the copy that the rest of this script works from.
vs <- variants
# Notes on the spreadsheet:
# - tumor_sample_name_new is PDX_Name -- it is truncated and becomes pdx_id
#   in the database
# - True_Mutation coding: 0 = benign, 1 = pathologic, 2 = vus

# Columns carried forward from the raw sheet.
cols_keep <- c(
  "tumor_sample_name_new",
  "Canonical_Variant_Classification", # table 1
  "Canonical_Hugo_Symbol", # table 1
  "BestEffect_Variant_Classification", # table 2
  "BestEffect_Hugo_Symbol", # table 2
  "True_Mutation", # want to show just 1s or 1s+2s
  "allele_fraction",
  "Coverage"
)

# Keep rows whose True_Mutation is present and equal to 1.
# Alternative that keeps every non-zero value instead:
# v = v[v$True_Mutation != 0 & !is.na(v$True_Mutation),]
v <- vs[!is.na(vs$True_Mutation) & vs$True_Mutation == 1, cols_keep]

# v1 is the "BestEffect" view: everything except the two Canonical_* columns.
v1 <- v[setdiff(
  names(v),
  c("Canonical_Variant_Classification", "Canonical_Hugo_Symbol")
)]
#### --- recode *_Variant_Classification -- ####
# Constants: raw classification terms grouped into the three classes that
# are plotted.
snv <- c("Missense_Mutation", "Nonsense_Mutation", "Nonstop_Mutation", "Translation_Start_Site", "Missense")
indel <- c("Frame_Shift_Del", "Frame_Shift_Ins", "In_Frame_Del", "In_Frame_Ins")
splice <- c("Splice_Region", "Splice_Site")
mut_types_keep <- c(snv, indel, splice)

# - for v1 - #
# Drop terms outside the three groups, like "0", "intron",
# "intergenic_variant". `%in%` also drops missing classifications.
v1 <- v1[v1$BestEffect_Variant_Classification %in% mut_types_keep, ]

# Map every classification to its group through a named lookup vector.
# This is vectorised, so it also works when v1 has zero rows; a
# `1:nrow(v1)` loop would iterate over c(1, 0) in that case and fail.
class_lookup <- c(
  setNames(rep("snv", length(snv)), snv),
  setNames(rep("indel", length(indel)), indel),
  setNames(rep("splice", length(splice)), splice)
)
v1$class2 <- unname(class_lookup[v1$BestEffect_Variant_Classification])

# Safety net: every remaining row must have been mapped to a group.
if (anyNA(v1$class2)) {
  stop("Encountered unexpected value.")
}
v1$class2 <- as.factor(v1$class2)

# Cross-tabulate raw terms against groups as a sanity check.
table(v1$BestEffect_Variant_Classification, v1$class2)
# # - for v2 - #
# # filter out terms that don't exist, like "0", "intron","intergenic_variant".
# v2 <- v2[v2$BestEffect_Variant_Classification %in% mut_types_keep,]
# # Create new columns that map these classifications to the above terms: snv, indel, splice
# v2$class2 <- NA_character_
# for(i in 1:nrow(v2)){
# class1 <- v2$BestEffect_Variant_Classification[i]
# if (class1 %in% snv){
# v2$class2[i] <- "snv"
# } else if (class1 %in% indel){
# v2$class2[i] <- "indel"
# } else if (class1 %in% splice){
# v2$class2[i] <- "splice"
# } else {
# stop("Encountered unexpected value.")
# }
# }
# v2$class2 <- as.factor(v2$class2)
# table(v2$BestEffect_Variant_Classification,v2$class2)
##### --- pivot into genes x samples --- ####
# - v1 - #
library(dplyr)

# One row per (sample, gene) pair; the classes seen for that pair are
# joined into a single ";"-separated string.
v1a <- dplyr::summarise(
  dplyr::group_by(v1, tumor_sample_name_new, BestEffect_Hugo_Symbol),
  id = paste(class2, collapse = ";")
)
# The summarised table looks like this (TODO: is snv;snv;snv a problem?)
# A tibble: 858 x 3
# Groups: tumor_sample_name_new [?]
# tumor_sample_name_new BestEffect_Hugo_Symbol id
# <chr> <chr> <chr>
# 1 CBAB-10855-V2 EPHA6 snv
# 2 CBAB-10855-V2 GATA2 snv
# 3 CBAB-10855-V2 MLL3 snv;snv;snv
# 4 CBAB-10855-V2 PAX5 snv

# Rebuild as a plain data.frame of factors, then turn `id` back into
# character so it can serve as the cell value below.
v1a <- as.data.frame(lapply(v1a, as.factor))
v1a[["id"]] <- as.character(v1a[["id"]])

# Cast to a matrix with genes as rows and samples as columns.
library(reshape2)
v1mat <- acast(
  v1a,
  BestEffect_Hugo_Symbol ~ tumor_sample_name_new,
  value.var = "id"
)
# produce ComplexHeatmap
# Disabled by default; change the condition to TRUE to render the oncoprint
# of v1mat to tmp/test_oncoprint.png.
if (FALSE) {
  # Load the package before opening the device so that a failed load cannot
  # leave a graphics device open.
  library(ComplexHeatmap)
  col <- c(snv = "red", indel = "blue", splice = "yellow")
  png(filename = "tmp/test_oncoprint.png", width = 25, height = 20, units = "in", res = 250)
  # draw() is required here: inside a braced block the object returned by
  # oncoPrint() is not auto-printed, so without it nothing reaches the PNG.
  ComplexHeatmap::draw(
    ComplexHeatmap::oncoPrint(v1mat,
      # each cell holds ";"-separated alteration classes
      get_type = function(x) strsplit(x, ";")[[1]],
      # one drawing function per class; the differing sizes keep stacked
      # alterations in the same cell distinguishable
      alter_fun = list(
        snv = function(x, y, w, h) grid.rect(x, y, w * 0.9, h * 0.9, gp = gpar(fill = col["snv"], col = NA)),
        indel = function(x, y, w, h) grid.rect(x, y, w * 0.9, h * 0.4, gp = gpar(fill = col["indel"], col = NA)),
        splice = function(x, y, w, h) grid.rect(x, y, w * 0.5, h * 0.5, gp = gpar(fill = col["splice"], col = NA))
      ), col = col,
      show_column_names = TRUE
    )
  )
  dev.off()
}
# - v2 - #
## TODO
# - add to dataframe - #
# Template: WT1 Missense NM_024426 c.679A>G p.S227G of 0.481013 in 158 reads
# <gene> <type> <transcriptid> <nt change> <AA change> of <A.F.> in <read support> reads.

# Columns needed to describe each variant.
annot_cols <- c(
  "tumor_sample_name_new", "BestEffect_Hugo_Symbol", "BestEffect_Variant_Classification",
  "BestEffect_Refseq_mRNA_Id", "BestEffect_cDNA_Change", "BestEffect_Protein_Change", "allele_fraction", "Coverage"
)

# Rows whose True_Mutation is present and equal to 1, annotation columns only.
vdf <- vs[!is.na(vs$True_Mutation) & vs$True_Mutation == 1, annot_cols]

# Drop intergenic variants. A missing classification makes the comparison NA,
# which selects an all-NA row; those rows are removed by the final filter.
vdf <- vdf[vdf$BestEffect_Variant_Classification != "intergenic_variant", ]

# Tidy the classification labels for display. The substitutions are applied
# in the listed order.
vdf$BestEffect_Variant_Classification <- Reduce(
  function(label, fix) gsub(fix[1], fix[2], label),
  list(
    c("_", " "),
    c(" Mutation", ""),
    c("Del", "Deletion"),
    c("Ins", "Insertion"),
    c("Frame Shift", "Frameshift"),
    c("In Frame", "In-Frame")
  ),
  init = vdf$BestEffect_Variant_Classification
)

# Keep allele fractions inside [0, 1]. Values that cannot be parsed become NA
# and likewise yield all-NA rows here.
vdf$allele_fraction <- as.numeric(vdf$allele_fraction)
vdf <- vdf[vdf$allele_fraction >= 0 & vdf$allele_fraction <= 1, ]

# Remove rows in which every column is NA.
vdf <- vdf[rowSums(is.na(vdf)) != ncol(vdf), ]
library(scales)

# Build one description per variant row, in the form:
# <gene> <type> <transcript> <cDNA change> <protein change> in <A.F. %> of <coverage> reads
# with() resolves the column names inside vdf first. attach() would place
# vdf behind the global environment, so any global variable with the same
# name would silently be used instead of the column.
# The zero-row guard is needed because paste() would otherwise return a
# single string built from the literal words alone, which cannot be
# assigned as a column of an empty data.frame.
tmp <- if (nrow(vdf) == 0L) {
  character(0)
} else {
  with(vdf, paste(
    BestEffect_Hugo_Symbol, BestEffect_Variant_Classification,
    BestEffect_Refseq_mRNA_Id, BestEffect_cDNA_Change,
    BestEffect_Protein_Change,
    "in", scales::percent(allele_fraction), "of", Coverage, "reads"
  ))
}
vdf2 <- vdf
vdf2$PDX_Molecular_Details <- tmp

# replace " NA " with " __ "
# The lookarounds leave the surrounding spaces unconsumed, so two adjacent
# missing fields ("... NA NA ...") are both replaced; matching on the
# literal " NA " would skip the second one.
vdf2$PDX_Molecular_Details <- gsub(
  "(?<= )NA(?= )", "__", vdf2$PDX_Molecular_Details,
  perl = TRUE
)

# pdx_id is the first 10 characters of the sample name; keep only variants
# whose pdx_id appears in df$namenum.
vdf2$pdx_id <- substring(vdf2$tumor_sample_name_new, 1, 10)
vdf2 <- vdf2[vdf2$pdx_id %in% df$namenum, ]
# Collapse vdf2 to one row per pdx_id, joining values with " | ":
# 1. BestEffect_Hugo_Symbol becomes "PDX Molecular Alterations Positive"
# 2. PDX_Molecular_Details becomes "PDX Molecular Details"
vdf_summ <- summarise(
  group_by(vdf2, pdx_id),
  `PDX Molecular Alterations Positive` = paste(BestEffect_Hugo_Symbol, collapse = " | "),
  `PDX Molecular Details` = paste(PDX_Molecular_Details, collapse = " | ")
)
# note: this code supersedes oncoprint.R, so I'm removing the need for that now in compile_upload.R
# TODO later: git rm oncoprint.R, etc.

# Complete the merge, then move columns and adjust the visibility index to
# match. The statements below depend on their order: each change to
# obInvisRet_ind mirrors the column move next to it.
# NOTE(review): moveMe() and obInvisRet_ind are defined outside this section;
# obInvisRet_ind presumably marks the boundary of the visible columns, so
# confirm against the caller.

# Left join: every row of df is kept, and rows without variants get NA in
# the two new columns. TRUE is spelled out because `T` can be reassigned.
df <- merge(df, vdf_summ, by.x = "namenum", by.y = "pdx_id", all.x = TRUE)
obInvisRet_ind <- obInvisRet_ind + 1 # accounts for new 'namenum' at column 1 (was near end)

# move 'name' columns from #1,2 out of visible range
cols_to_move <- c("PDX RNA-Seq Name", "namenum")
df <- moveMe(df, cols_to_move, "after", names(df)[obInvisRet_ind])
obInvisRet_ind <- obInvisRet_ind - length(cols_to_move)

# move new mutation columns into visible range in intuitive context
cols_to_move <- c("PDX Molecular Alterations Positive", "PDX Molecular Details")
df <- moveMe(df, cols_to_move, "after", "PDX HemoSeq")
obInvisRet_ind <- obInvisRet_ind + length(cols_to_move)
# Appendix: template example used to build this document
# Never executed as part of the script; run the inner lines by hand to see a
# minimal oncoprint. FALSE is spelled out because `F` can be reassigned.
if (FALSE) {
  # Toy genes x samples matrix; each cell lists ";"-separated alterations.
  mat <- read.table(textConnection(
    ",s1,s2,s3
g1,snv;indel,snv,indel
g2,,snv;indel,snv
g3,snv,,indel;snv"
  ), row.names = 1, header = TRUE, sep = ",", stringsAsFactors = FALSE)
  mat <- as.matrix(mat)
  mat
  library(ComplexHeatmap)
  col <- c(snv = "red", indel = "blue")
  ComplexHeatmap::oncoPrint(mat,
    # split a cell into its individual alteration types
    get_type = function(x) strsplit(x, ";")[[1]],
    # one drawing function per alteration type
    alter_fun = list(
      snv = function(x, y, w, h) grid.rect(x, y, w * 0.9, h * 0.9, gp = gpar(fill = col["snv"], col = NA)),
      indel = function(x, y, w, h) grid.rect(x, y, w * 0.9, h * 0.4, gp = gpar(fill = col["indel"], col = NA))
    ), col = col,
    show_column_names = TRUE
  )
}
# Ad hoc filter to say 'none' for known negative PDX Molecular Alterations
# Earlier approach, driven by an external spreadsheet (kept for reference):
# pdx_hemo_none <- readxl::read_excel(file.path(data_outside_app_dir,"20180924_pdx_hemoseq_negative_mutations_or_vus.xlsx"))
# pdx_hemo_none <- pdx_hemo_none$`PDX Name`
# if( length(setdiff(pdx_hemo_none,df$`PDX Name`)) != 0 ){
# stop("not all pdx hemoseq negative samples are in df. Probably a PRMS-mismatch.")
# }
# #
# for (pdx in pdx_hemo_none) {
# if(!is.na(df[df$`PDX Name`==pdx,"PDX Molecular Alterations Positive"]) |
# is.na(df[df$`PDX Name`==pdx,"PDX Molecular Details"])) {
# print(paste(pdx,"has some Molecular Alteration info and shouldn't"))
# }
# }

# Rows whose HemoSeq is "Complete" but which have no alteration and no
# details. The !is.na() term turns a missing HemoSeq into FALSE, so the mask
# never contains NA.
hemo_none_rows <- df$`PDX HemoSeq` == "Complete" &
  !is.na(df$`PDX HemoSeq`) &
  is.na(df$`PDX Molecular Alterations Positive`) &
  is.na(df$`PDX Molecular Details`)

# Names of the affected models.
pdx_hemo_none <- df$`PDX Name`[hemo_none_rows]

# Fill exactly the selected rows. Assigning through the mask, rather than
# re-matching rows by `PDX Name`, cannot overwrite a same-named row that does
# have data and does not fail when some `PDX Name` is NA. The guard skips the
# assignment when nothing matched, so column types stay untouched.
if (any(hemo_none_rows)) {
  df[
    hemo_none_rows,
    c("PDX Molecular Alterations Positive", "PDX Molecular Details")
  ] <- "none detected"
}