-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathomicron-BA_fixing.R
75 lines (61 loc) · 2.49 KB
/
omicron-BA_fixing.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
# Fixing Omicron data
#setwd('~/git/gromstole')

# Fetch the BA.1/BA.2 mutation table (a CSV attached to the pango-designation
# repository), keeping text columns as plain character vectors.
omicron <- read.csv(
  file = "https://github.com/cov-lineages/pango-designation/files/7668225/Omicron_BA.1_BA.2_mutations.csv",
  stringsAsFactors = FALSE
)

# Give the first and fourth columns descriptive names.
names(omicron)[1] <- "mut_nuc"
names(omicron)[4] <- "mut_aa"

# The third column is named "X" and every entry in it is NA, so discard it.
omicron <- omicron[, -3]

# Save the lightly cleaned table before any derived columns are added.
# The path is relative, so the script must be run from the project root.
write.csv(omicron, file = "data/omicron-BA-raw.csv", row.names = FALSE)

# Preview the first rows.
head(omicron)
# Finding the Type
# Classify each row from its `mut_nuc` label:
#   "~" = substitution(s), "-" = deletion, "+" = insertion.
# The assignments run in order and later ones overwrite earlier ones, so a
# label that contains both "_" and "ins" ends up as "+".
omicron$type <- NA
# Short names means mutation
omicron$type[nchar(omicron$mut_nuc) <= 7] <- "~"
# Labels that list several comma-separated changes are also substitutions
omicron$type[grepl(", ", omicron$mut_nuc, fixed = TRUE)] <- "~"
# Deletions and insertions are (mostly) marked
omicron$type[grepl("del", omicron$mut_nuc, fixed = TRUE)] <- "-"
# Labels containing "_" are treated as deletions too, except the row tagged
# "(main)", which is set explicitly further down in this script.
# `fixed = TRUE` matters here: as a regular expression "(main)" is a capture
# group that matches any occurrence of "main", not the literal parentheses.
omicron$type[
  grepl("_", omicron$mut_nuc, fixed = TRUE) &
    !grepl("(main)", omicron$mut_nuc, fixed = TRUE)
] <- "-"
omicron$type[grepl("ins", omicron$mut_nuc, fixed = TRUE)] <- "+"
# Inspect the classification next to the labels it was derived from
omicron[, c("type", "mut_nuc", "mut_aa")]
# Finding the position
# `pos` is cut out of the `mut_nuc` label as text; nothing here converts it
# to a number.
omicron$pos <- NA

# Short labels: the position is everything between the first and the last
# character (presumably <ref><position><alt>, e.g. "C241T" -> "241";
# confirm against the data).
easy_muts <- which(nchar(omicron$mut_nuc) <= 7)
omicron$pos[easy_muts] <- substr(omicron$mut_nuc[easy_muts], 2,
                                 nchar(omicron$mut_nuc[easy_muts]) - 1)

# Deletions: keep the text before the first "_".
# vapply() is used instead of sapply() because sapply() returns an empty
# list when there is nothing to iterate over, and assigning that would turn
# `pos` into a list column; vapply() always yields a character vector.
dels <- which(omicron$type == "-")
omicron$pos[dels] <- vapply(
  strsplit(omicron$mut_nuc[dels], split = "_"),
  `[`, character(1), 1
)

# Insertions: keep the text before the first of the characters G, A, T, C,
# i, n or s (i.e. the leading part ahead of the bases / the "ins" marker).
inss <- which(omicron$type == "+")
omicron$pos[inss] <- vapply(
  strsplit(omicron$mut_nuc[inss], "[GATCins]"),
  `[`, character(1), 1
)

# Inspect the extracted positions
omicron[, c("pos", "mut_nuc", "type")]
# Finding Alt (the hard one)
# What `alt` holds depends on the row type:
#   short substitution  -> last character of the label
#   comma-separated     -> last character of every piece, concatenated
#   deletion            -> second number minus first number plus one
#   insertion           -> the label with digits, parentheses and the letters
#                          i, n, s removed
# Relies on `easy_muts`, `dels` and `inss` computed earlier in this script.
omicron$alt <- NA

# Short labels: the last character
omicron$alt[easy_muts] <- substr(omicron$mut_nuc[easy_muts],
                                 nchar(omicron$mut_nuc[easy_muts]),
                                 nchar(omicron$mut_nuc[easy_muts]))

# Comma-separated labels: last character of each piece, pasted together.
# Empty pieces are dropped *before* pasting. Returning NULL for them from an
# inner sapply() yields a list, which paste0() renders as the text "NULL".
multi_muts <- which(grepl(", ", omicron$mut_nuc))
omicron$alt[multi_muts] <- vapply(
  strsplit(omicron$mut_nuc[multi_muts], split = ", "),
  function(pieces) {
    pieces <- pieces[nzchar(pieces)]
    paste0(substr(pieces, nchar(pieces), nchar(pieces)), collapse = "")
  },
  character(1)
)

# Deletions: splitting on "_", "d", "e", "l" leaves the two numbers as the
# first two pieces; store second - first + 1 (the size of the range).
# vapply() guarantees a numeric vector even when `dels` is empty.
omicron$alt[dels] <- vapply(
  strsplit(omicron$mut_nuc[dels], split = "[_del]"),
  function(bounds) as.numeric(bounds[2]) - as.numeric(bounds[1]) + 1,
  numeric(1)
)

# Insertions: remove every digit, parenthesis and the letters i/n/s.
# This equals concatenating the non-empty pieces of a strsplit() on the same
# character class, but always gives exactly one string per row (the
# piece-wise version returns several values when the label has more than one
# non-empty piece, which cannot be stored in a single cell).
omicron$alt[inss] <- gsub("[(0-9)ins]", "", omicron$mut_nuc[inss])

# Inspect the result
omicron[, c("alt", "mut_nuc")]
# Dealing with (main)
# Rows whose label carries the literal tag "(main)" get hard-coded values:
# a deletion ("-") at position 21987 of size 9.
# - `fixed = TRUE` matches the parentheses literally; as a regex "(main)" is
#   a capture group that matches any "main".
# - Each column is assigned separately. Assigning one length-3 vector to a
#   rows-by-3-columns subset is recycled column by column, which scrambles
#   the values as soon as more than one row matches.
# The values are strings because the original assignment used a mixed
# c("-", 21987, 9), which R coerces to character.
main_rows <- grepl("(main)", omicron$mut_nuc, fixed = TRUE)
omicron$type[main_rows] <- "-"
omicron$pos[main_rows] <- "21987"
omicron$alt[main_rows] <- "9"
# One column for lineage
# Collapse the three per-lineage flag columns into a single `lineage` column
# holding the name of the lineage whose flag is "Y".
# Exactly one string is produced per row so the column stays an ordinary
# character vector:
#   - one "Y"      -> that lineage name (same as before)
#   - several "Y"  -> names joined with ";"
#   - no "Y"       -> NA
# Returning a variable-length result per row (as a plain apply() would)
# creates a list column, which write.csv() cannot write.
lineage_names <- c("BA.1", "BA.2", "B.1.1.529")
has_flag <- as.matrix(omicron[, lineage_names]) == "Y"
omicron$lineage <- vapply(
  seq_len(nrow(has_flag)),
  function(i) {
    # which() drops NA comparisons, so missing flags count as "not set"
    hits <- lineage_names[which(has_flag[i, ])]
    if (length(hits) == 0) {
      NA_character_
    } else {
      paste(hits, collapse = ";")
    }
  },
  character(1)
)

# Write the fully annotated table; the path is relative to the working
# directory the script is run from.
write.csv(omicron, file = "data/omicron-BA.csv", row.names = FALSE)